[infra] python-v1.6.3 (#1166)
* [infra] remove no-bdm_table resources from list_dataset_tables function

* [infra] fix list_datasets function

* fix: table update for mode staging (#1168)

* fix: config calls

* feat: add config variable to readme

* feat: bump poetry version

* feat: fix readme

* feat: bump actions version

* fix: change import order

* feat: bump poetry version

Co-authored-by: Diego Oliveira <[email protected]>
lucascr91 and d116626 authored Mar 23, 2022
1 parent 9b17392 commit 711a9ef
Showing 12 changed files with 130 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/data-check.yml
@@ -59,7 +59,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
-    pip install basedosdados==1.6.3b1 pyarrow pytest toml loguru
+    pip install basedosdados==1.6.3 pyarrow pytest toml
- name: Set up base dos dados environment
shell: bash
env:
2 changes: 1 addition & 1 deletion .github/workflows/metadata-validate.yml
@@ -36,7 +36,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
-    pip install basedosdados==1.6.3b1 toml loguru
+    pip install basedosdados==1.6.3 toml
- name: Set up base dos dados environment
run: python .github/workflows/env-setup/env_setup.py
shell: bash
4 changes: 2 additions & 2 deletions .github/workflows/table-approve.yml
@@ -36,7 +36,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
-    pip install basedosdados==1.6.3b1 toml loguru
+    pip install basedosdados==1.6.3 toml
- name: Set up gcloud
uses: google-github-actions/setup-gcloud@v0
with:
@@ -116,7 +116,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
-    pip install basedosdados==1.6.3b1 pyarrow pytest toml loguru
+    pip install basedosdados==1.6.3 pyarrow pytest toml
- name: Set up basedosdados environment
run: |
cd .github/workflows/env-setup
17 changes: 17 additions & 0 deletions README.md
@@ -126,6 +126,23 @@ import basedosdados as bd
bd.list_datasets()
```

+### How to set parameters using the package configuration
+
+```py
+import basedosdados as bd
+
+# set the global billing_project_id
+bd.config.billing_project_id = '<billing-project-id>'
+
+query = """
+SELECT
+    *
+FROM `basedosdados.br_bd_diretorios_brasil.municipio`
+"""
+
+df = bd.read_sql(query=query)
+```

To learn more, see the [examples](https://github.com/basedosdados/analises/tree/main/artigos) or the [API documentation](https://basedosdados.github.io/mais/api_reference_python/)

## Using in R
8 changes: 4 additions & 4 deletions python-package/basedosdados/__init__.py
@@ -1,9 +1,9 @@
-from basedosdados.constants import constants
import sys
import os

-sys.path.append(os.getcwd() + "/python-package")
+sys.path.append(f"{os.getcwd()}/python-package")

+from basedosdados.constants import constants, config
from basedosdados.upload.dataset import Dataset
from basedosdados.upload.storage import Storage
from basedosdados.upload.table import Table
@@ -21,5 +21,5 @@
get_dataset_description,
get_table_columns,
get_table_size,
-    search
-)
+    search,
+)
1 change: 1 addition & 0 deletions python-package/basedosdados/constants.py
@@ -9,6 +9,7 @@ class config:
verbose: bool = True
billing_project_id: str = None
project_config_path: str = None
+    from_file: bool = False


class constants(Enum):
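The new `from_file` flag added above can be set once per session instead of being passed to every call. A minimal sketch of the intended usage (the query and project id are placeholders):

```py
import basedosdados as bd

# read credentials from the local config files on every call,
# rather than passing from_file=True to each download function
bd.config.from_file = True
bd.config.billing_project_id = "<billing-project-id>"

df = bd.read_sql(query="SELECT 1 AS x")
```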
30 changes: 20 additions & 10 deletions python-package/basedosdados/download/download.py
@@ -22,6 +22,16 @@
from pandas_gbq.gbq import GenericGBQException


+def _set_config_variables(billing_project_id, from_file):
+
+    # standard billing_project_id configuration
+    billing_project_id = billing_project_id or config.billing_project_id
+    # standard from_file configuration
+    from_file = from_file or config.from_file
+
+    return billing_project_id, from_file


def read_sql(
query,
billing_project_id=None,
@@ -50,9 +60,9 @@ def read_sql(
Query result
"""

-    # standard billing_project_id configuration
-    if billing_project_id is None:
-        billing_project_id == config.billing_project_id
+    billing_project_id, from_file = _set_config_variables(
+        billing_project_id=billing_project_id, from_file=from_file
+    )

try:
# Set a two hours timeout
@@ -127,9 +137,9 @@ def read_table(
Query result
"""

-    # standard billing_project_id configuration
-    if billing_project_id is None:
-        billing_project_id == config.billing_project_id
+    billing_project_id, from_file = _set_config_variables(
+        billing_project_id=billing_project_id, from_file=from_file
+    )

if (dataset_id is not None) and (table_id is not None):
query = f"""
@@ -205,15 +215,15 @@ def download(
Exception: If either table_id, dataset_id or query are empty.
"""

+    billing_project_id, from_file = _set_config_variables(
+        billing_project_id=billing_project_id, from_file=from_file
+    )

if (query is None) and ((table_id is None) or (dataset_id is None)):
raise BaseDosDadosException(
"Either table_id, dataset_id or query should be filled."
)

-    # standard billing_project_id configuration
-    if billing_project_id is None:
-        billing_project_id == config.billing_project_id

client = google_client(query_project_id, billing_project_id, from_file, reauth)

# makes sure that savepath is a filepath and not a folder
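Worth noting: the deleted blocks used `==` (a comparison with no effect) instead of `=`, so the global `config.billing_project_id` was never actually applied; routing everything through `_set_config_variables` fixes that. A hedged sketch of the resulting precedence (project ids and queries are placeholders):

```py
import basedosdados as bd

bd.config.billing_project_id = "default-project"

# the argument is None, so the call falls back to the global default
df1 = bd.read_sql(query="SELECT 1 AS x")

# an explicit argument still wins: `arg or config.value`
# only falls back when the argument is falsy
df2 = bd.read_sql(query="SELECT 1 AS x", billing_project_id="other-project")
```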
83 changes: 54 additions & 29 deletions python-package/basedosdados/download/metadata.py
@@ -1,6 +1,8 @@
from google.cloud import bigquery
import pandas as pd
import requests
+from collections import defaultdict
+import math

from basedosdados.download.base import credentials

@@ -19,7 +21,23 @@ def _safe_fetch(url:str):
except requests.exceptions.ConnectionError as errc:
print ("Error Connecting:",errc)
except requests.exceptions.Timeout as errt:
print ("Timeout Error:",errt)
print ("Timeout Error:",errt)

+def _dict_from_page(json_response):
+    """
+    Generate a dict from BD's API response with dataset_id and description as keys
+    """
+    temp_dict = {
+        "dataset_id": [
+            dataset["name"] for dataset in json_response["result"]["datasets"]
+        ],
+        "description": [
+            dataset["notes"] if "notes" in dataset.keys() else None
+            for dataset in json_response["result"]["datasets"]
+        ],
+    }
+
+    return temp_dict

def _fix_size(s, step=80):

@@ -83,16 +101,12 @@ def _handle_output(verbose, output_type, df, col_name=None):

return None

-def list_datasets(query, limit=10, with_description=False, verbose=True):
+def list_datasets(with_description=False, verbose=True):
"""
    This function uses `bd_dataset_search` website API
    endpoint to retrieve a list of available datasets.
    Args:
-        query (str):
-            String to search in datasets' metadata.
-        limit (int):
-            Field to limit the number of results
        with_description (bool): Optional
            If True, fetch short dataset description for each dataset.
        verbose (bool): Optional.
@@ -101,34 +115,46 @@
Returns:
list | stdout
"""

url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table"

# first request is made separately since we need to now the number of pages before the iteration
page_size = 100 # this function will only made more than one requisition if there are more than 100 datasets in the API response #pylint: disable=C0301
url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q=&resource_type=bdm_table&page=1&page_size={page_size}" # pylint: disable=C0301
response = _safe_fetch(url)

json_response = response.json()

-    # this dict has all information we need to output the function
-    dataset_dict = {
-        "dataset_id": [
-            dataset["name"] for dataset in json_response["result"]["datasets"]
-        ],
-        "description": [
-            dataset["notes"] if "notes" in dataset.keys() else None
-            for dataset in json_response["result"]["datasets"]
-        ],
-    }

-    # select desired output using dataset_id info. Note that the output is either a standardized string or a list
-    if verbose & (with_description == False):
+    n_datasets = json_response["result"]["count"]
+    n_pages = math.ceil(n_datasets / page_size)
+    temp_dict = _dict_from_page(json_response)

+    temp_dicts = [temp_dict]
+    for page in range(2, n_pages + 1):
+        url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q=&resource_type=bdm_table&page={page}&page_size={page_size}"  # pylint: disable=C0301
+        response = _safe_fetch(url)
+        json_response = response.json()
+        temp_dict = _dict_from_page(json_response)
+        temp_dicts.append(temp_dict)

+    dataset_dict = defaultdict(list)

+    for d in temp_dicts:  # pylint: disable=C0103
+        for key, value in d.items():
+            dataset_dict[key].append(value)

+    # flatten inner lists
+    dataset_dict["dataset_id"] = [
+        item for sublist in dataset_dict["dataset_id"] for item in sublist
+    ]  # pylint: disable=C0301
+    dataset_dict["description"] = [
+        item for sublist in dataset_dict["description"] for item in sublist
+    ]  # pylint: disable=C0301
+    # select desired output using dataset_id info. Note that the output is either a standardized string or a list  # pylint: disable=C0301
+    if verbose & (with_description is False):
return _print_output(pd.DataFrame.from_dict(dataset_dict)[["dataset_id"]])
elif verbose & with_description:
return _print_output(
pd.DataFrame.from_dict(dataset_dict)[["dataset_id", "description"]]
)
elif (verbose == False) & (with_description == False):
elif (verbose is False) & (with_description is False):
return dataset_dict["dataset_id"]
elif (verbose == False) & with_description:
elif (verbose is False) & with_description:
return [
{
"dataset_id": dataset_dict["dataset_id"][k],
Expand All @@ -137,7 +163,6 @@ def list_datasets(query, limit=10, with_description=False, verbose=True):
for k in range(len(dataset_dict["dataset_id"]))
]


def list_dataset_tables(
dataset_id,
with_description=False,
@@ -172,11 +197,11 @@ def list_dataset_tables(
# this dict has all the information needed to output the function
table_dict = {
"table_id": [
dataset["resources"][k]["name"] for k in range(len(dataset["resources"]))
dataset["resources"][k]["name"] for k in range(len(dataset["resources"])) if dataset['resources'][k]['resource_type']=='bdm_table'
],
"description": [
dataset["resources"][k]["description"]
-        for k in range(len(dataset["resources"]))
+        for k in range(len(dataset["resources"])) if dataset['resources'][k]['resource_type']=='bdm_table'
],
}
# select desired output using table_id info. Note that the output is either a standardized string or a list
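With `query` and `limit` gone, `list_datasets` now pages through the whole catalog by itself, and `list_dataset_tables` skips resources that are not BigQuery tables. A hedged usage sketch (the dataset name is illustrative, and `list_dataset_tables` is assumed to accept the same `verbose` switch shown for `list_datasets`):

```py
import basedosdados as bd

# pages through the API 100 datasets at a time; returns every dataset_id
ids = bd.list_datasets(with_description=False, verbose=False)

# only resources with resource_type == 'bdm_table' are listed now,
# so external links and auxiliary files are filtered out
tables = bd.list_dataset_tables("br_bd_diretorios_brasil", verbose=False)
```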
8 changes: 5 additions & 3 deletions python-package/basedosdados/upload/base.py
@@ -27,10 +27,12 @@ def __init__(
metadata_path=None,
overwrite_cli_config=False,
):

# standard config_path configuration
-        if config_path is None:
-            config_path == config.config_path
+        config_path = (
+            config.project_config_path
+            if config.project_config_path is not None
+            else config_path
+        )

self.config_path = Path.home() / config_path
self._init_config(force=overwrite_cli_config)
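The constructor now prefers `config.project_config_path` whenever it is set, and the result is still joined onto the home directory (`Path.home() / config_path`). A minimal sketch, assuming a config folder under the user's home (the folder name and the `Dataset` keyword are illustrative):

```py
import basedosdados as bd
from basedosdados import Dataset

# resolved as Path.home() / ".basedosdados_project"
bd.config.project_config_path = ".basedosdados_project"

ds = Dataset(dataset_id="<dataset_id>")  # picks up the config folder above
```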
11 changes: 4 additions & 7 deletions python-package/basedosdados/upload/table.py
@@ -743,15 +743,13 @@ def create(

self.client["bigquery_staging"].create_table(table)


logger.success(
"{object} {object_id} was {action}!",
object_id=self.table_id,
object="Table",
action="created",
)


def update(self, mode="all", not_found_ok=True):
"""Updates BigQuery schema and description.
Args:
@@ -787,12 +785,11 @@
encoding="utf-8",
).write(table.description)

if m == "staging":
table.schema = self._load_schema(m)
# when mode is staging the table schema already exists
table.schema = self._load_schema(m)
fields = ["description", "schema"] if m == "prod" else ["description"]
self.client[f"bigquery_{m}"].update_table(table, fields=fields)

self.client[f"bigquery_{m}"].update_table(
table, fields=["description", "schema"]
)
logger.success(
" {object} {object_id} was {action}!",
object_id=self.table_id,
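After this fix, `update` reloads the schema locally in every mode but only sends it to BigQuery for `prod`; staging tables keep the schema they were created with and only get their description patched. A hedged sketch (constructor keywords are illustrative):

```py
from basedosdados import Table

tb = Table(dataset_id="<dataset_id>", table_id="<table_id>")

# staging: only the description is patched; the existing schema is untouched
tb.update(mode="staging")

# prod: both description and schema are patched
tb.update(mode="prod")
```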
14 changes: 7 additions & 7 deletions python-package/pyproject.toml
@@ -13,7 +13,7 @@ packages = [
]
readme = "README.md"
repository = "https://github.com/base-dos-dados/bases"
version = "1.6.2"
version = "1.6.3-beta.2"

[tool.poetry.scripts]
basedosdados = 'basedosdados.cli.cli:cli'
@@ -25,17 +25,17 @@ click = "8.0.3"
google-cloud-bigquery = "2.30.1"
google-cloud-bigquery-storage = "1.1.0"
google-cloud-storage = "1.42.3"
loguru = "^0.6.0"
pandas = "1.2.4"
pandas-gbq = "0.13.2"
pandavro = "^1.6.0"
pyaml = "20.4.0"
pyarrow = "6.0.0"
python = ">=3.7.1,<3.11"
'ruamel.yaml' = "0.17.10"
toml = "^0.10.2"
tomlkit = "0.7.0"
tqdm = "4.50.2"
pandas= "1.2.4"
'ruamel.yaml'= "0.17.10"
pyarrow = "6.0.0"
pandavro = "^1.6.0"
loguru = "^0.6.0"
toml = "^0.10.2"

[tool.poetry.dev-dependencies]
Jinja2 = "3.0.3"