[infra] Python 1.6.4 (#1188)
* feat: refactor update columns function in table.py

* expose chunk_size parameter

* fix: make staging data accessible

* fix: make staging data accessible

* fix: add parquet to storage options

* bump version

* fix: change bd_bdm_table_schema to new format

* feat: test mergify and pylint

* fix: change spatial_coverage_tree to its own endpoint

* feat: publish python-1.6.4

* Update table-approve.yml

Co-authored-by: hellcassius <[email protected]>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
3 people authored Jul 6, 2022
1 parent 00fd76f commit 7eaa428
Showing 9 changed files with 105 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/data-check.yml
@@ -59,7 +59,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install basedosdados==1.6.4b5 pyarrow pytest toml
pip install basedosdados==1.6.4 pyarrow pytest toml
- name: Set up base dos dados environment
shell: bash
env:
2 changes: 1 addition & 1 deletion .github/workflows/metadata-validate.yml
@@ -36,7 +36,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install basedosdados==1.6.4b5 toml
pip install basedosdados==1.6.4 toml
- name: Set up base dos dados environment
run: python .github/workflows/env-setup/env_setup.py
shell: bash
4 changes: 2 additions & 2 deletions .github/workflows/table-approve.yml
@@ -36,7 +36,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install basedosdados==1.6.4b5 toml
pip install basedosdados==1.6.4 toml
- name: Set up gcloud
uses: google-github-actions/setup-gcloud@v0
with:
@@ -116,7 +116,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install basedosdados==1.6.4b5 pyarrow pytest toml
pip install basedosdados==1.6.4 pyarrow pytest toml
- name: Set up basedosdados environment
run: |
cd .github/workflows/env-setup
36 changes: 32 additions & 4 deletions python-package/basedosdados/cli/cli.py
@@ -280,6 +280,11 @@ def init_table(
default=None,
help="Location of dataset data. List of possible region names locations: https://cloud.google.com/bigquery/docs/locations",
)
@click.option(
"--chunk_size",
default=None,
help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
)
@click.pass_context
def create_table(
ctx,
@@ -295,6 +300,7 @@ def create_table(
columns_config_url_or_path,
dataset_is_public,
location,
chunk_size,
):

Table(table_id=table_id, dataset_id=dataset_id, **ctx.obj).create(
@@ -308,6 +314,7 @@ def create_table(
columns_config_url_or_path=columns_config_url_or_path,
dataset_is_public=dataset_is_public,
location=location,
chunk_size=chunk_size,
)

click.echo(
@@ -428,11 +435,21 @@ def delete_table(ctx, dataset_id, table_id, mode):
default="raise",
help="[raise|replace|pass] if file alread exists",
)
@click.option(
"--chunk_size",
default=None,
help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
)
@click.pass_context
def upload_table(ctx, dataset_id, table_id, filepath, partitions, if_exists):
def upload_table(
ctx, dataset_id, table_id, filepath, partitions, if_exists, chunk_size
):

blob_name = Table(table_id=table_id, dataset_id=dataset_id, **ctx.obj).append(
filepath=filepath, partitions=partitions, if_exists=if_exists
filepath=filepath,
partitions=partitions,
if_exists=if_exists,
chunk_size=chunk_size,
)

click.echo(
@@ -493,12 +510,23 @@ def init_storage(ctx, bucket_name, replace, very_sure):
default="raise",
help="[raise|replace|pass] if file alread exists",
)
@click.option(
"--chunk_size",
default=None,
help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
)
@click.pass_context
def upload_storage(ctx, dataset_id, table_id, filepath, mode, partitions, if_exists):
def upload_storage(
ctx, dataset_id, table_id, filepath, mode, partitions, if_exists, chunk_size
):

ctx.obj.pop("bucket_name")
blob_name = Storage(dataset_id, table_id, **ctx.obj).upload(
filepath=filepath, mode=mode, partitions=partitions, if_exists=if_exists
filepath=filepath,
mode=mode,
partitions=partitions,
if_exists=if_exists,
chunk_size=chunk_size,
)

click.echo(
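
The new --chunk_size option on create_table, upload_table, and upload_storage expects a byte count that is a multiple of 256 KB. A minimal sketch in plain Python of picking a valid value (the 32 MB target is an arbitrary example, not a project default):

    # Round an arbitrary target size down to the nearest multiple of 256 KB,
    # which is the granularity the option help text above requires.
    BASE = 256 * 1024            # 262144 bytes
    target = 32 * 1024 * 1024    # desired chunk size: 32 MB
    chunk_size = (target // BASE) * BASE
    assert chunk_size % BASE == 0  # safe to pass as --chunk_size
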
53 changes: 32 additions & 21 deletions python-package/basedosdados/upload/dataset.py
@@ -120,28 +120,39 @@ def publicize(self, mode="all", dataset_is_public=True):
dataset = m["client"].get_dataset(m["id"])
entries = dataset.access_entries
# TODO https://github.com/basedosdados/mais/pull/1020
if dataset_is_public and "staging" not in dataset.dataset_id:
entries.extend(
[
bigquery.AccessEntry(
role="roles/bigquery.dataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
bigquery.AccessEntry(
role="roles/bigquery.metadataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
bigquery.AccessEntry(
role="roles/bigquery.user",
entity_type="iamMember",
entity_id="allUsers",
),
]
)
# TODO if staging dataset is private, the prod view can't access it: if dataset_is_public and "staging" not in dataset.dataset_id:
if dataset_is_public:
if "staging" not in dataset.dataset_id:
entries.extend(
[
bigquery.AccessEntry(
role="roles/bigquery.dataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
bigquery.AccessEntry(
role="roles/bigquery.metadataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
bigquery.AccessEntry(
role="roles/bigquery.user",
entity_type="iamMember",
entity_id="allUsers",
),
]
)
else:
entries.extend(
[
bigquery.AccessEntry(
role="roles/bigquery.dataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
]
)
dataset.access_entries = entries

m["client"].update_dataset(dataset, ["access_entries"])
logger.success(
" {object} {object_id}_{mode} was {action}!",
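
With this change, publicize keeps granting dataViewer, metadataViewer, and user to allUsers on production datasets, while staging datasets now receive dataViewer only. A minimal usage sketch, assuming Dataset is importable from the package root and using a hypothetical dataset_id:

    from basedosdados import Dataset

    # publicize() defaults to mode="all", so prod and staging access entries
    # are both updated in a single call.
    dataset = Dataset(dataset_id="br_example_dataset")
    dataset.publicize(dataset_is_public=True)
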
8 changes: 2 additions & 6 deletions python-package/basedosdados/upload/metadata.py
@@ -212,14 +212,10 @@ def metadata_schema(self) -> dict:

if self.table_id:
table_url = f"{self.CKAN_URL}/api/3/action/bd_bdm_table_schema"
table_schema = requests.get(table_url).json().get("result")

return table_schema
return requests.get(table_url).json().get("result")

dataset_url = f"{self.CKAN_URL}/api/3/action/bd_dataset_schema"
dataset_schema = requests.get(dataset_url).json().get("result")

return dataset_schema
return requests.get(dataset_url).json().get("result")

def exists_in_ckan(self) -> bool:
"""Check if Metadata object refers to an existing CKAN package or reso
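
metadata_schema now returns the endpoint result directly, and table schemas come from the dedicated bd_bdm_table_schema action. A sketch of the equivalent raw requests, assuming the public CKAN instance at https://basedosdados.org (the class actually reads self.CKAN_URL from its configuration):

    import requests

    CKAN_URL = "https://basedosdados.org"  # assumed; normally taken from self.CKAN_URL

    table_schema = (
        requests.get(f"{CKAN_URL}/api/3/action/bd_bdm_table_schema").json().get("result")
    )
    dataset_schema = (
        requests.get(f"{CKAN_URL}/api/3/action/bd_dataset_schema").json().get("result")
    )
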
13 changes: 11 additions & 2 deletions python-package/basedosdados/upload/storage.py
@@ -113,6 +113,7 @@ def upload(
mode="all",
partitions=None,
if_exists="raise",
chunk_size=None,
**upload_args,
):
"""Upload to storage at `<bucket_name>/<mode>/<dataset_id>/<table_id>`. You can:
@@ -158,6 +159,10 @@
* 'raise' : Raises Conflict exception
* 'replace' : Replace table
* 'pass' : Do nothing
chunk_size (int): Optional
The size of a chunk of data whenever iterating (in bytes).
This must be a multiple of 256 KB per the API specification.
If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
upload_args ():
Extra arguments accepted by [`google.cloud.storage.blob.Blob.upload_from_file`](https://googleapis.dev/python/storage/latest/blobs.html?highlight=upload_from_filename#google.cloud.storage.blob.Blob.upload_from_filename)
@@ -169,7 +174,11 @@
path = Path(path)

if path.is_dir():
paths = [f for f in path.glob("**/*") if f.is_file() and f.suffix == ".csv"]
paths = [
f
for f in path.glob("**/*")
if f.is_file() and f.suffix in [".csv", ".parquet", "parquet.gzip"]
]

parts = [
(
@@ -197,7 +206,7 @@

blob_name = self._build_blob_name(filepath.name, m, part)

blob = self.bucket.blob(blob_name)
blob = self.bucket.blob(blob_name, chunk_size=chunk_size)

if not blob.exists() or if_exists == "replace":

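
Storage.upload now also picks up .parquet files when given a directory and forwards chunk_size to the underlying blob. A usage sketch with hypothetical identifiers and path, assuming Storage accepts dataset_id and table_id as keywords (the CLI passes them positionally):

    from basedosdados import Storage

    st = Storage(dataset_id="br_example_dataset", table_id="example_table")
    st.upload(
        "data/example_table.parquet",  # a single file or a directory of .csv/.parquet files
        mode="staging",
        if_exists="replace",
        chunk_size=8 * 256 * 1024,     # 2 MB, a multiple of 256 KB
    )
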
24 changes: 22 additions & 2 deletions python-package/basedosdados/upload/table.py
@@ -567,6 +567,7 @@ def create(
columns_config_url_or_path=None,
dataset_is_public=True,
location=None,
chunk_size=None,
):
"""Creates BigQuery table at staging dataset.
@@ -626,6 +627,10 @@
location (str): Optional. Location of dataset data.
List of possible region names locations: https://cloud.google.com/bigquery/docs/locations
chunk_size (int): Optional
The size of a chunk of data whenever iterating (in bytes).
This must be a multiple of 256 KB per the API specification.
If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
"""

if path is None:
@@ -651,7 +656,10 @@
):

Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
path, mode="staging", if_exists=if_storage_data_exists
path,
mode="staging",
if_exists=if_storage_data_exists,
chunk_size=chunk_size,
)

# Create Dataset if it doesn't exist
@@ -835,7 +843,14 @@ def delete(self, mode):
action="deleted",
)

def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
def append(
self,
filepath,
partitions=None,
if_exists="replace",
chunk_size=None,
**upload_args,
):
"""Appends new data to existing BigQuery table.
As long as the data has the same schema. It appends the data in the
@@ -854,6 +869,10 @@ def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
* 'raise' : Raises Conflict exception
* 'replace' : Replace table
* 'pass' : Do nothing
chunk_size (int): Optional
The size of a chunk of data whenever iterating (in bytes).
This must be a multiple of 256 KB per the API specification.
If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
"""
if not self.table_exists("staging"):
raise BaseDosDadosException(
@@ -865,6 +884,7 @@
mode="staging",
partitions=partitions,
if_exists=if_exists,
chunk_size=chunk_size,
**upload_args,
)
logger.success(
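
Table.create and Table.append accept the same chunk_size and pass it through to Storage.upload; append still requires the staging table to exist. A usage sketch with hypothetical identifiers:

    from basedosdados import Table

    tb = Table(dataset_id="br_example_dataset", table_id="example_table")
    tb.append(
        filepath="data/new_rows.csv",   # hypothetical path; schema must match the existing table
        if_exists="replace",
        chunk_size=4 * 256 * 1024,      # 1 MB, a multiple of 256 KB
    )
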
4 changes: 2 additions & 2 deletions python-package/pyproject.toml
@@ -13,7 +13,7 @@ packages = [
]
readme = "README.md"
repository = "https://github.com/base-dos-dados/bases"
version = "1.6.3-beta.2"
version = "1.6.4"

[tool.poetry.scripts]
basedosdados = 'basedosdados.cli.cli:cli'
@@ -26,6 +26,7 @@ click = "8.0.3"
google-cloud-bigquery = "2.30.1"
google-cloud-bigquery-storage = "1.1.0"
google-cloud-storage = "1.42.3"
importlib-metadata = "^4.11.3"
ipykernel = "5.3.4"
jupyter = "^1.0.0"
loguru = "^0.6.0"
@@ -44,7 +45,6 @@
toml = "^0.10.2"
tomlkit = "0.7.0"
tqdm = "4.50.2"
importlib-metadata = "^4.11.3"

[tool.black]
# Use the more relaxed max line length permitted in PEP8.