Added core project actions. #38

Merged (9 commits) on Mar 10, 2024
60 changes: 60 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,60 @@
# Taken from Xee and minimally modified: https://github.com/google/Xee/blob/main/.github/workflows/ci-build.yml
#
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
name: ci

on:
  # Triggers the workflow on push or pull request events but only for the main branch
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  build:
    name: "python ${{ matrix.python-version }} tests"
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: [
          "3.8",
          "3.9",
          "3.10",
          "3.11",
          "3.12",
        ]
    steps:
      - name: Cancel previous
        uses: styfle/[email protected]
        with:
          access_token: ${{ github.token }}
        if: ${{ github.ref != 'refs/heads/main' }}
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
      - name: Install qarray
        run: |
          pip install -e .[test]
      - name: Run unit tests
        run: |
          pytest qarray
52 changes: 52 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,52 @@
# Taken from Xee and minimally modified: https://github.com/google/Xee/blob/main/.github/workflows/lint.yml
#
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
name: lint

on:
  # Triggers the workflow on push or pull request events but only for the main branch
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  build:
    name: "python ${{ matrix.python-version }} lint"
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
    steps:
      - name: Cancel previous
        uses: styfle/[email protected]
        with:
          access_token: ${{ github.token }}
        if: ${{ github.ref != 'refs/heads/main' }}
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install linter
        run: |
          pip install pyink
      - name: Lint with pyink
        run: |
          pyink --check .
97 changes: 97 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,97 @@
# Taken from Xee and minimally modified: https://github.com/google/Xee/blob/main/.github/workflows/publish.yml
#
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Publish to PyPI

on:
  release:
    types: [published]

jobs:
  build-artifacts:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/[email protected]
        with:
          python-version: 3.9

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install build setuptools setuptools-scm wheel twine check-manifest

      - name: Build tarball and wheels
        run: |
          git clean -xdf
          git restore -SW .
          python -m build --sdist --wheel .
      - name: Check built artifacts
        run: |
          python -m twine check dist/*
          pwd
      - uses: actions/upload-artifact@v2
        with:
          name: releases
          path: dist

  test-built-dist:
    needs: build-artifacts
    runs-on: ubuntu-latest
    steps:
      - uses: actions/[email protected]
        name: Install Python
        with:
          python-version: 3.9
      - uses: actions/download-artifact@v2
        with:
          name: releases
          path: dist
      - name: List contents of built dist
        run: |
          ls -ltrh
          ls -ltrh dist
      - name: Publish package to TestPyPI
        if: github.event_name == 'push'
        uses: pypa/[email protected]
        with:
          user: __token__
          password: ${{ secrets.TESTPYPI_TOKEN }}
          repository_url: https://test.pypi.org/legacy/
          verbose: true

      - name: Check uploaded package
        if: github.event_name == 'push'
        run: |
          sleep 3
          python -m pip install --upgrade pip
          python -m pip install --extra-index-url https://test.pypi.org/simple --upgrade qarray
          python -c "import qarray; print(qarray.__version__)"

  upload-to-pypi:
    needs: test-built-dist
    if: github.event_name == 'release'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v2
        with:
          name: releases
          path: dist
      - name: Publish package to PyPI
        uses: pypa/[email protected]
        with:
          user: __token__
          password: ${{ secrets.PYPI_TOKEN }}
          verbose: true
22 changes: 22 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,22 @@
# Contributing Guide

## Where to start?

Please check out the [issues tab](https://github.com/alxmrs/qarray/issues).
Let's have a discussion over there before proceeding with any changes. Great
minds think alike -- someone may have already created an issue related to your
inquiry. If there's a bug, please let us know.

If you're totally new to open source development, I recommend reading
[Xarray's contributing guide](https://docs.xarray.dev/en/stable/contributing.html).

## Developer setup

0. (Recommended) Create a project-specific Python
   environment. [(mini)Conda](https://docs.anaconda.com/free/miniconda/index.html)
   or [Mamba](https://mamba.readthedocs.io/en/latest/) is preferred.
1. Clone the repository (bonus: [via SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account))
   and `cd qarray` (the project root).
1. Install dev dependencies via `pip install -e ".[dev]"`.
58 changes: 42 additions & 16 deletions README.md
@@ -33,21 +33,21 @@ df = qr.read_xarray(ds)
df.head()
```

Succinctly, we "pivot" Xarray Datasets to treat them like
tables so we can run SQL queries against them.
Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
SQL queries against them.
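
Concretely, once the Dataset is exposed as a (Dask) DataFrame, any SQL engine that accepts DataFrames can query it. Below is a minimal sketch using `dask_sql.Context`, the same pattern as the `perf_tests` scripts in this PR; the tutorial dataset and the `air` table and column names are illustrative, not prescribed by qarray.

```python
# Minimal sketch: pivot a Dataset with qarray, then query it with dask-sql.
# The dataset and the table/column names here are illustrative.
import xarray as xr
import qarray as qr
from dask_sql import Context

ds = xr.tutorial.open_dataset('air_temperature')
df = qr.read_xarray(ds)

c = Context()
c.create_table('air', df)
query = c.sql('SELECT "lat", "lon", AVG("air") AS air_avg FROM "air" GROUP BY "lat", "lon"')
print(query.compute().head())
```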

## Why build this?

A few reasons:

* Even though SQL is the lingua franca of data, scientific datasets are often
inaccessible to non-scientists.
* Joining tabular data with raster data is common yet difficult. It
could be easy.
* There are many cloud-native, Xarray-openable datasets,
* Joining tabular data with raster data is common yet difficult. It could be
easy.
* There are many cloud-native, Xarray-openable datasets,
from [Google Earth Engine](https://github.com/google/Xee)
to [Pangeo Forge](https://pangeo-forge.org/). Wouldn’t it be great if these
were also SQL-accessible? How can the bridge be built with minimal effort?
were also SQL-accessible? How can the bridge be built with minimal effort?

This is a light-weight way to prove the value of the interface.

@@ -63,14 +63,14 @@ That's it!

## Why does this work?

Underneath Xarray, Dask, and Pandas, there are NumPy arrays. These are
paged in chucks and represented contiguously in memory. It is only a
matter of metadata that breaks them up into ndarrays. `to_dataframe()`
just changes this metadata (via a `ravel()`/`reshape()`), back into a
column amenable to a DataFrame.
Underneath Xarray, Dask, and Pandas, there are NumPy arrays. These are paged in
chunks and represented contiguously in memory. It is only a matter of metadata
that breaks them up into ndarrays. `to_dataframe()` just changes this metadata
(via a `ravel()`/`reshape()`) back into a column amenable to a DataFrame.

There is added overhead from duplicating dimensions as columns, which
we see as worth the convenience of DataFrames.
There is added overhead from duplicating dimensions as columns, which we see as
worth the convenience of DataFrames.
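
As a tiny illustration (plain NumPy, not qarray's internals verbatim), `ravel()` on a contiguous array returns a view over the same buffer, so flattening an ndarray into a column-shaped array costs only a metadata change:

```python
import numpy as np

arr = np.arange(12).reshape(3, 4)  # stand-in for one variable's ndarray
flat = arr.ravel()                 # 1-D view, shaped like a DataFrame column

# For a contiguous array, ravel() returns a view: the buffer is shared and
# only the shape metadata differs, which is why the pivot is cheap.
assert np.shares_memory(arr, flat)
assert flat.shape == (arr.size,)
```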

## What are the current limitations?

@@ -87,10 +87,36 @@ supported ([geopandas/dask-geopandas#72](https://github.com/geopandas/dask-geopa
## What would a deeper integration look like?

I have a few ideas so far. One approach involves applying operations directly on
Xarray Datasets. This approach is being pursued
Xarray Datasets. This approach is being pursued
[here](https://github.com/google/weather-tools/tree/main/xql), as `xql`.

Deeper still: I was thinking we could make a [virtual](https://fsspec.github.io/kerchunk/)
Deeper still: I was thinking we could make
a [virtual](https://fsspec.github.io/kerchunk/)
filesystem for parquet that would internally map to Zarr. Raster-backed virtual
parquet would open up integrations to numerous tools like dask, pyarrow, duckdb,
and BigQuery. More thoughts on this in [#4](https://github.com/alxmrs/qarray/issues/4).
and BigQuery. More thoughts on this
in [#4](https://github.com/alxmrs/qarray/issues/4).

## License

```
Copyright 2024 Alexander Merose

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

Some sources are re-distributed from Google LLC
via https://github.com/google/Xee (also Apache-2.0 License) with and without
modification. These files are subject to the original copyright; they include
the original license header comment as well as a note to indicate
modifications (when appropriate).
2 changes: 1 addition & 1 deletion perf_tests/compute_air.py
@@ -10,4 +10,4 @@

df = qr.read_xarray(air).compute()

print(len(df))
print(len(df))
10 changes: 6 additions & 4 deletions perf_tests/groupby_air.py
@@ -10,25 +10,27 @@
chunks = {'time': 240, 'lat': 5, 'lon': 7}
air = air.chunk(chunks)
air_small = air.isel(
time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10)
time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10)
).chunk(chunks)

df = qr.read_xarray(air_small)

c = Context()
c.create_table('air', df)

query = c.sql('''
query = c.sql("""
SELECT
"lat", "lon", SUM("air") as air_total
FROM
"air"
GROUP BY
"lat", "lon"
''')
""")

result = query.compute()

expected = air_small.dims['lat'] * air_small.dims['lon']
assert len(result) == expected, f'Length must be {expected}, but was {len(result)}.'
assert (
    len(result) == expected
), f'Length must be {expected}, but was {len(result)}.'
print(expected)
8 changes: 5 additions & 3 deletions perf_tests/groupby_air_full.py
@@ -15,17 +15,19 @@
c = Context()
c.create_table('air', df)

query = c.sql('''
query = c.sql("""
SELECT
"lat", "lon", SUM("air") as air_total
FROM
"air"
GROUP BY
"lat", "lon"
''')
""")

result = query.compute()

expected = air.dims['lat'] * air.dims['lon']
assert len(result) == expected, f'Length must be {expected}, but was {len(result)}.'
assert (
    len(result) == expected
), f'Length must be {expected}, but was {len(result)}.'
print(expected)