[infra] python-v1.6.3 (#1166)
* [infra] remove no-bdm_table resources from list_dataset_tables function

* [infra] fix list_datasets function

* fix: table update for mode staging (#1168)

* fix: config calls

* feat: add config variable to readme

* feat: bump poetry version

* feat: fix readme

* feat: bump actions version

* fix: change import order

* feat: bump poetry version

Co-authored-by: Diego Oliveira <[email protected]>
lucascr91 and d116626 authored Mar 23, 2022
1 parent 9b17392 commit 711a9ef
Showing 12 changed files with 130 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/data-check.yml
@@ -59,7 +59,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
-    pip install basedosdados==1.6.3b1 pyarrow pytest toml loguru
+    pip install basedosdados==1.6.3 pyarrow pytest toml
- name: Set up base dos dados environment
shell: bash
env:
2 changes: 1 addition & 1 deletion .github/workflows/metadata-validate.yml
@@ -36,7 +36,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
-    pip install basedosdados==1.6.3b1 toml loguru
+    pip install basedosdados==1.6.3 toml
- name: Set up base dos dados environment
run: python .github/workflows/env-setup/env_setup.py
shell: bash
4 changes: 2 additions & 2 deletions .github/workflows/table-approve.yml
@@ -36,7 +36,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
-    pip install basedosdados==1.6.3b1 toml loguru
+    pip install basedosdados==1.6.3 toml
- name: Set up gcloud
uses: google-github-actions/setup-gcloud@v0
with:
@@ -116,7 +116,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
-    pip install basedosdados==1.6.3b1 pyarrow pytest toml loguru
+    pip install basedosdados==1.6.3 pyarrow pytest toml
- name: Set up basedosdados environment
run: |
cd .github/workflows/env-setup
17 changes: 17 additions & 0 deletions README.md
@@ -126,6 +126,23 @@ import basedosdados as bd
bd.list_datasets()
```

+### How to set parameters using the package configuration
+
+```py
+import basedosdados as bd
+
+# set the global billing_project_id
+bd.config.billing_project_id = '<billing-project-id>'
+
+query = """
+SELECT
+    *
+FROM `basedosdados.br_bd_diretorios_brasil.municipio`
+"""
+
+df = bd.read_sql(query=query)
+```

To learn more, see the [examples](https://github.com/basedosdados/analises/tree/main/artigos) or the [API documentation](https://basedosdados.github.io/mais/api_reference_python/)

## Using in R
8 changes: 4 additions & 4 deletions python-package/basedosdados/__init__.py
@@ -1,9 +1,9 @@
-from basedosdados.constants import constants
import sys
import os

-sys.path.append(os.getcwd() + "/python-package")
+sys.path.append(f"{os.getcwd()}/python-package")

+from basedosdados.constants import constants, config
from basedosdados.upload.dataset import Dataset
from basedosdados.upload.storage import Storage
from basedosdados.upload.table import Table
@@ -21,5 +21,5 @@
get_dataset_description,
get_table_columns,
get_table_size,
-    search
-)
+    search,
+)
1 change: 1 addition & 0 deletions python-package/basedosdados/constants.py
@@ -9,6 +9,7 @@ class config:
verbose: bool = True
billing_project_id: str = None
project_config_path: str = None
+    from_file: bool = False


class constants(Enum):
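The new `from_file` flag added above can be set once per session instead of being passed to every call. A minimal sketch of the intended usage (the query and project id are placeholders):

```py
import basedosdados as bd

# read credentials from the local config files on every call,
# rather than passing from_file=True to each download function
bd.config.from_file = True
bd.config.billing_project_id = "<billing-project-id>"

df = bd.read_sql(query="SELECT 1 AS x")
```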
30 changes: 20 additions & 10 deletions python-package/basedosdados/download/download.py
@@ -22,6 +22,16 @@
from pandas_gbq.gbq import GenericGBQException


+def _set_config_variables(billing_project_id, from_file):
+
+    # standard billing_project_id configuration
+    billing_project_id = billing_project_id or config.billing_project_id
+    # standard from_file configuration
+    from_file = from_file or config.from_file
+
+    return billing_project_id, from_file


def read_sql(
query,
billing_project_id=None,
@@ -50,9 +60,9 @@ def read_sql(
Query result
"""

-    # standard billing_project_id configuration
-    if billing_project_id is None:
-        billing_project_id == config.billing_project_id
+    billing_project_id, from_file = _set_config_variables(
+        billing_project_id=billing_project_id, from_file=from_file
+    )

try:
# Set a two hours timeout
@@ -127,9 +137,9 @@ def read_table(
Query result
"""

-    # standard billing_project_id configuration
-    if billing_project_id is None:
-        billing_project_id == config.billing_project_id
+    billing_project_id, from_file = _set_config_variables(
+        billing_project_id=billing_project_id, from_file=from_file
+    )

if (dataset_id is not None) and (table_id is not None):
query = f"""
@@ -205,15 +215,15 @@ def download(
Exception: If either table_id, dataset_id or query are empty.
"""

+    billing_project_id, from_file = _set_config_variables(
+        billing_project_id=billing_project_id, from_file=from_file
+    )

if (query is None) and ((table_id is None) or (dataset_id is None)):
raise BaseDosDadosException(
"Either table_id, dataset_id or query should be filled."
)

-    # standard billing_project_id configuration
-    if billing_project_id is None:
-        billing_project_id == config.billing_project_id

client = google_client(query_project_id, billing_project_id, from_file, reauth)

# makes sure that savepath is a filepath and not a folder
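Worth noting: the deleted blocks used `==` (a comparison with no effect) instead of `=`, so the global `config.billing_project_id` was never actually applied; routing everything through `_set_config_variables` fixes that. A hedged sketch of the resulting precedence (project ids and queries are placeholders):

```py
import basedosdados as bd

bd.config.billing_project_id = "default-project"

# the argument is None, so the call falls back to the global default
df1 = bd.read_sql(query="SELECT 1 AS x")

# an explicit argument still wins: `arg or config.value`
# only falls back when the argument is falsy
df2 = bd.read_sql(query="SELECT 1 AS x", billing_project_id="other-project")
```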
83 changes: 54 additions & 29 deletions python-package/basedosdados/download/metadata.py
@@ -1,6 +1,8 @@
from google.cloud import bigquery
import pandas as pd
import requests
+from collections import defaultdict
+import math

from basedosdados.download.base import credentials

@@ -19,7 +21,23 @@ def _safe_fetch(url:str):
except requests.exceptions.ConnectionError as errc:
print ("Error Connecting:",errc)
except requests.exceptions.Timeout as errt:
print ("Timeout Error:",errt)
print ("Timeout Error:",errt)

+def _dict_from_page(json_response):
+    """
+    Generate a dict from BD's API response with dataset_id and description as keys
+    """
+    temp_dict = {
+        "dataset_id": [
+            dataset["name"] for dataset in json_response["result"]["datasets"]
+        ],
+        "description": [
+            dataset["notes"] if "notes" in dataset.keys() else None
+            for dataset in json_response["result"]["datasets"]
+        ],
+    }
+
+    return temp_dict

def _fix_size(s, step=80):

@@ -83,16 +101,12 @@ def _handle_output(verbose, output_type, df, col_name=None):

return None

-def list_datasets(query, limit=10, with_description=False, verbose=True):
+def list_datasets(with_description=False, verbose=True):
"""
    This function uses `bd_dataset_search` website API
    endpoint to retrieve a list of available datasets.
    Args:
-        query (str):
-            String to search in datasets' metadata.
-        limit (int):
-            Field to limit the number of results
        with_description (bool): Optional
            If True, fetch short dataset description for each dataset.
        verbose (bool): Optional.
@@ -101,34 +115,46 @@
Returns:
list | stdout
"""

url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table"

# first request is made separately since we need to now the number of pages before the iteration
page_size = 100 # this function will only made more than one requisition if there are more than 100 datasets in the API response #pylint: disable=C0301
url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q=&resource_type=bdm_table&page=1&page_size={page_size}" # pylint: disable=C0301
response = _safe_fetch(url)

json_response = response.json()

-    # this dict has all information we need to output the function
-    dataset_dict = {
-        "dataset_id": [
-            dataset["name"] for dataset in json_response["result"]["datasets"]
-        ],
-        "description": [
-            dataset["notes"] if "notes" in dataset.keys() else None
-            for dataset in json_response["result"]["datasets"]
-        ],
-    }

-    # select desired output using dataset_id info. Note that the output is either a standardized string or a list
-    if verbose & (with_description == False):
+    n_datasets = json_response["result"]["count"]
+    n_pages = math.ceil(n_datasets / page_size)
+    temp_dict = _dict_from_page(json_response)

+    temp_dicts = [temp_dict]
+    for page in range(2, n_pages + 1):
+        url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q=&resource_type=bdm_table&page={page}&page_size={page_size}"  # pylint: disable=C0301
+        response = _safe_fetch(url)
+        json_response = response.json()
+        temp_dict = _dict_from_page(json_response)
+        temp_dicts.append(temp_dict)

+    dataset_dict = defaultdict(list)

+    for d in temp_dicts:  # pylint: disable=C0103
+        for key, value in d.items():
+            dataset_dict[key].append(value)

+    # flatten inner lists
+    dataset_dict["dataset_id"] = [
+        item for sublist in dataset_dict["dataset_id"] for item in sublist
+    ]  # pylint: disable=C0301
+    dataset_dict["description"] = [
+        item for sublist in dataset_dict["description"] for item in sublist
+    ]  # pylint: disable=C0301
+    # select desired output using dataset_id info. Note that the output is either a standardized string or a list  # pylint: disable=C0301
+    if verbose & (with_description is False):
return _print_output(pd.DataFrame.from_dict(dataset_dict)[["dataset_id"]])
elif verbose & with_description:
return _print_output(
pd.DataFrame.from_dict(dataset_dict)[["dataset_id", "description"]]
)
elif (verbose == False) & (with_description == False):
elif (verbose is False) & (with_description is False):
return dataset_dict["dataset_id"]
elif (verbose == False) & with_description:
elif (verbose is False) & with_description:
return [
{
"dataset_id": dataset_dict["dataset_id"][k],
Expand All @@ -137,7 +163,6 @@ def list_datasets(query, limit=10, with_description=False, verbose=True):
for k in range(len(dataset_dict["dataset_id"]))
]


def list_dataset_tables(
dataset_id,
with_description=False,
@@ -172,11 +197,11 @@ def list_dataset_tables(
# this dict has all the information needed to output the function
table_dict = {
"table_id": [
dataset["resources"][k]["name"] for k in range(len(dataset["resources"]))
dataset["resources"][k]["name"] for k in range(len(dataset["resources"])) if dataset['resources'][k]['resource_type']=='bdm_table'
],
"description": [
dataset["resources"][k]["description"]
-        for k in range(len(dataset["resources"]))
+        for k in range(len(dataset["resources"])) if dataset['resources'][k]['resource_type']=='bdm_table'
],
}
# select desired output using table_id info. Note that the output is either a standardized string or a list
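With `query` and `limit` gone, `list_datasets` now pages through the whole catalog by itself, and `list_dataset_tables` skips resources that are not BigQuery tables. A hedged usage sketch (the dataset name is illustrative, and `list_dataset_tables` is assumed to accept the same `verbose` switch shown for `list_datasets`):

```py
import basedosdados as bd

# pages through the API 100 datasets at a time; returns every dataset_id
ids = bd.list_datasets(with_description=False, verbose=False)

# only resources with resource_type == 'bdm_table' are listed now,
# so external links and auxiliary files are filtered out
tables = bd.list_dataset_tables("br_bd_diretorios_brasil", verbose=False)
```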
8 changes: 5 additions & 3 deletions python-package/basedosdados/upload/base.py
@@ -27,10 +27,12 @@ def __init__(
metadata_path=None,
overwrite_cli_config=False,
):

# standard config_path configuration
-        if config_path is None:
-            config_path == config.config_path
+        config_path = (
+            config.project_config_path
+            if config.project_config_path is not None
+            else config_path
+        )

self.config_path = Path.home() / config_path
self._init_config(force=overwrite_cli_config)
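The constructor now prefers `config.project_config_path` whenever it is set, and the result is still joined onto the home directory (`Path.home() / config_path`). A minimal sketch, assuming a config folder under the user's home (the folder name and the `Dataset` keyword are illustrative):

```py
import basedosdados as bd
from basedosdados import Dataset

# resolved as Path.home() / ".basedosdados_project"
bd.config.project_config_path = ".basedosdados_project"

ds = Dataset(dataset_id="<dataset_id>")  # picks up the config folder above
```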
11 changes: 4 additions & 7 deletions python-package/basedosdados/upload/table.py
@@ -743,15 +743,13 @@ def create(

self.client["bigquery_staging"].create_table(table)


logger.success(
"{object} {object_id} was {action}!",
object_id=self.table_id,
object="Table",
action="created",
)


def update(self, mode="all", not_found_ok=True):
"""Updates BigQuery schema and description.
Args:
@@ -787,12 +785,11 @@
encoding="utf-8",
).write(table.description)

if m == "staging":
table.schema = self._load_schema(m)
# when mode is staging the table schema already exists
table.schema = self._load_schema(m)
fields = ["description", "schema"] if m == "prod" else ["description"]
self.client[f"bigquery_{m}"].update_table(table, fields=fields)

self.client[f"bigquery_{m}"].update_table(
table, fields=["description", "schema"]
)
logger.success(
" {object} {object_id} was {action}!",
object_id=self.table_id,
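After this fix, `update` reloads the schema locally in every mode but only sends it to BigQuery for `prod`; staging tables keep the schema they were created with and only get their description patched. A hedged sketch (constructor keywords are illustrative):

```py
from basedosdados import Table

tb = Table(dataset_id="<dataset_id>", table_id="<table_id>")

# staging: only the description is patched; the existing schema is untouched
tb.update(mode="staging")

# prod: both description and schema are patched
tb.update(mode="prod")
```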
14 changes: 7 additions & 7 deletions python-package/pyproject.toml
@@ -13,7 +13,7 @@ packages = [
]
readme = "README.md"
repository = "https://github.com/base-dos-dados/bases"
version = "1.6.2"
version = "1.6.3-beta.2"

[tool.poetry.scripts]
basedosdados = 'basedosdados.cli.cli:cli'
@@ -25,17 +25,17 @@ click = "8.0.3"
google-cloud-bigquery = "2.30.1"
google-cloud-bigquery-storage = "1.1.0"
google-cloud-storage = "1.42.3"
loguru = "^0.6.0"
pandas = "1.2.4"
pandas-gbq = "0.13.2"
pandavro = "^1.6.0"
pyaml = "20.4.0"
pyarrow = "6.0.0"
python = ">=3.7.1,<3.11"
'ruamel.yaml' = "0.17.10"
toml = "^0.10.2"
tomlkit = "0.7.0"
tqdm = "4.50.2"
pandas= "1.2.4"
'ruamel.yaml'= "0.17.10"
pyarrow = "6.0.0"
pandavro = "^1.6.0"
loguru = "^0.6.0"
toml = "^0.10.2"

[tool.poetry.dev-dependencies]
Jinja2 = "3.0.3"