[infra] Python 1.6.4 (#1188)
* feat: refactor update columns function in table.py

* expose chunk_size parameter

* fix: make staging data accessible

* fix: make staging data accessible

* fix: add parquet to storage options

* bump version

* fix: change bd_bdm_table_schema to new format

* feat: test mergify and pylint

* fix: change spatial_coverage_tree to its own endpoint

* feat: publish python-1.6.4

* Update table-approve.yml

Co-authored-by: hellcassius <[email protected]>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
3 people authored Jul 6, 2022
1 parent 00fd76f commit 7eaa428
Showing 9 changed files with 105 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/data-check.yml
@@ -59,7 +59,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install basedosdados==1.6.4b5 pyarrow pytest toml
pip install basedosdados==1.6.4 pyarrow pytest toml
- name: Set up base dos dados environment
shell: bash
env:
2 changes: 1 addition & 1 deletion .github/workflows/metadata-validate.yml
@@ -36,7 +36,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install basedosdados==1.6.4b5 toml
pip install basedosdados==1.6.4 toml
- name: Set up base dos dados environment
run: python .github/workflows/env-setup/env_setup.py
shell: bash
4 changes: 2 additions & 2 deletions .github/workflows/table-approve.yml
@@ -36,7 +36,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install basedosdados==1.6.4b5 toml
pip install basedosdados==1.6.4 toml
- name: Set up gcloud
uses: google-github-actions/setup-gcloud@v0
with:
@@ -116,7 +116,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install basedosdados==1.6.4b5 pyarrow pytest toml
pip install basedosdados==1.6.4 pyarrow pytest toml
- name: Set up basedosdados environment
run: |
cd .github/workflows/env-setup
36 changes: 32 additions & 4 deletions python-package/basedosdados/cli/cli.py
@@ -280,6 +280,11 @@ def init_table(
default=None,
help="Location of dataset data. List of possible region names locations: https://cloud.google.com/bigquery/docs/locations",
)
@click.option(
"--chunk_size",
default=None,
help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
)
@click.pass_context
def create_table(
ctx,
@@ -295,6 +300,7 @@ def create_table(
columns_config_url_or_path,
dataset_is_public,
location,
chunk_size,
):

Table(table_id=table_id, dataset_id=dataset_id, **ctx.obj).create(
@@ -308,6 +314,7 @@ def create_table(
columns_config_url_or_path=columns_config_url_or_path,
dataset_is_public=dataset_is_public,
location=location,
chunk_size=chunk_size,
)

click.echo(
@@ -428,11 +435,21 @@ def delete_table(ctx, dataset_id, table_id, mode):
default="raise",
help="[raise|replace|pass] if file alread exists",
)
@click.option(
"--chunk_size",
default=None,
help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
)
@click.pass_context
def upload_table(ctx, dataset_id, table_id, filepath, partitions, if_exists):
def upload_table(
ctx, dataset_id, table_id, filepath, partitions, if_exists, chunk_size
):

blob_name = Table(table_id=table_id, dataset_id=dataset_id, **ctx.obj).append(
filepath=filepath, partitions=partitions, if_exists=if_exists
filepath=filepath,
partitions=partitions,
if_exists=if_exists,
chunk_size=chunk_size,
)

click.echo(
@@ -493,12 +510,23 @@ def init_storage(ctx, bucket_name, replace, very_sure):
default="raise",
help="[raise|replace|pass] if file alread exists",
)
@click.option(
"--chunk_size",
default=None,
help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
)
@click.pass_context
def upload_storage(ctx, dataset_id, table_id, filepath, mode, partitions, if_exists):
def upload_storage(
ctx, dataset_id, table_id, filepath, mode, partitions, if_exists, chunk_size
):

ctx.obj.pop("bucket_name")
blob_name = Storage(dataset_id, table_id, **ctx.obj).upload(
filepath=filepath, mode=mode, partitions=partitions, if_exists=if_exists
filepath=filepath,
mode=mode,
partitions=partitions,
if_exists=if_exists,
chunk_size=chunk_size,
)

click.echo(
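
The new --chunk_size option on create_table, upload_table, and upload_storage expects a byte count that is a multiple of 256 KB. A minimal sketch in plain Python of picking a valid value (the 32 MB target is an arbitrary example, not a project default):

    # Round an arbitrary target size down to the nearest multiple of 256 KB,
    # which is the granularity the option help text above requires.
    BASE = 256 * 1024            # 262144 bytes
    target = 32 * 1024 * 1024    # desired chunk size: 32 MB
    chunk_size = (target // BASE) * BASE
    assert chunk_size % BASE == 0  # safe to pass as --chunk_size
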
53 changes: 32 additions & 21 deletions python-package/basedosdados/upload/dataset.py
@@ -120,28 +120,39 @@ def publicize(self, mode="all", dataset_is_public=True):
dataset = m["client"].get_dataset(m["id"])
entries = dataset.access_entries
# TODO https://github.com/basedosdados/mais/pull/1020
if dataset_is_public and "staging" not in dataset.dataset_id:
entries.extend(
[
bigquery.AccessEntry(
role="roles/bigquery.dataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
bigquery.AccessEntry(
role="roles/bigquery.metadataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
bigquery.AccessEntry(
role="roles/bigquery.user",
entity_type="iamMember",
entity_id="allUsers",
),
]
)
# TODO if staging dataset is private, the prod view can't access it: if dataset_is_public and "staging" not in dataset.dataset_id:
if dataset_is_public:
if "staging" not in dataset.dataset_id:
entries.extend(
[
bigquery.AccessEntry(
role="roles/bigquery.dataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
bigquery.AccessEntry(
role="roles/bigquery.metadataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
bigquery.AccessEntry(
role="roles/bigquery.user",
entity_type="iamMember",
entity_id="allUsers",
),
]
)
else:
entries.extend(
[
bigquery.AccessEntry(
role="roles/bigquery.dataViewer",
entity_type="iamMember",
entity_id="allUsers",
),
]
)
dataset.access_entries = entries

m["client"].update_dataset(dataset, ["access_entries"])
logger.success(
" {object} {object_id}_{mode} was {action}!",
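
With this change, publicize keeps granting dataViewer, metadataViewer, and user to allUsers on production datasets, while staging datasets now receive dataViewer only. A minimal usage sketch, assuming Dataset is importable from the package root and using a hypothetical dataset_id:

    from basedosdados import Dataset

    # publicize() defaults to mode="all", so prod and staging access entries
    # are both updated in a single call.
    dataset = Dataset(dataset_id="br_example_dataset")
    dataset.publicize(dataset_is_public=True)
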
8 changes: 2 additions & 6 deletions python-package/basedosdados/upload/metadata.py
@@ -212,14 +212,10 @@ def metadata_schema(self) -> dict:

if self.table_id:
table_url = f"{self.CKAN_URL}/api/3/action/bd_bdm_table_schema"
table_schema = requests.get(table_url).json().get("result")

return table_schema
return requests.get(table_url).json().get("result")

dataset_url = f"{self.CKAN_URL}/api/3/action/bd_dataset_schema"
dataset_schema = requests.get(dataset_url).json().get("result")

return dataset_schema
return requests.get(dataset_url).json().get("result")

def exists_in_ckan(self) -> bool:
"""Check if Metadata object refers to an existing CKAN package or reso
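
metadata_schema now returns the endpoint result directly, and table schemas come from the dedicated bd_bdm_table_schema action. A sketch of the equivalent raw requests, assuming the public CKAN instance at https://basedosdados.org (the class actually reads self.CKAN_URL from its configuration):

    import requests

    CKAN_URL = "https://basedosdados.org"  # assumed; normally taken from self.CKAN_URL

    table_schema = (
        requests.get(f"{CKAN_URL}/api/3/action/bd_bdm_table_schema").json().get("result")
    )
    dataset_schema = (
        requests.get(f"{CKAN_URL}/api/3/action/bd_dataset_schema").json().get("result")
    )
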
13 changes: 11 additions & 2 deletions python-package/basedosdados/upload/storage.py
@@ -113,6 +113,7 @@ def upload(
mode="all",
partitions=None,
if_exists="raise",
chunk_size=None,
**upload_args,
):
"""Upload to storage at `<bucket_name>/<mode>/<dataset_id>/<table_id>`. You can:
@@ -158,6 +159,10 @@
* 'raise' : Raises Conflict exception
* 'replace' : Replace table
* 'pass' : Do nothing
chunk_size (int): Optional
The size of a chunk of data whenever iterating (in bytes).
This must be a multiple of 256 KB per the API specification.
If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
upload_args ():
Extra arguments accepted by [`google.cloud.storage.blob.Blob.upload_from_file`](https://googleapis.dev/python/storage/latest/blobs.html?highlight=upload_from_filename#google.cloud.storage.blob.Blob.upload_from_filename)
@@ -169,7 +174,11 @@
path = Path(path)

if path.is_dir():
paths = [f for f in path.glob("**/*") if f.is_file() and f.suffix == ".csv"]
paths = [
f
for f in path.glob("**/*")
if f.is_file() and f.suffix in [".csv", ".parquet", "parquet.gzip"]
]

parts = [
(
@@ -197,7 +206,7 @@

blob_name = self._build_blob_name(filepath.name, m, part)

blob = self.bucket.blob(blob_name)
blob = self.bucket.blob(blob_name, chunk_size=chunk_size)

if not blob.exists() or if_exists == "replace":

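
Storage.upload now also picks up .parquet files when given a directory and forwards chunk_size to the underlying blob. A usage sketch with hypothetical identifiers and path, assuming Storage accepts dataset_id and table_id as keywords (the CLI passes them positionally):

    from basedosdados import Storage

    st = Storage(dataset_id="br_example_dataset", table_id="example_table")
    st.upload(
        "data/example_table.parquet",  # a single file or a directory of .csv/.parquet files
        mode="staging",
        if_exists="replace",
        chunk_size=8 * 256 * 1024,     # 2 MB, a multiple of 256 KB
    )
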
24 changes: 22 additions & 2 deletions python-package/basedosdados/upload/table.py
@@ -567,6 +567,7 @@ def create(
columns_config_url_or_path=None,
dataset_is_public=True,
location=None,
chunk_size=None,
):
"""Creates BigQuery table at staging dataset.
@@ -626,6 +627,10 @@
location (str): Optional. Location of dataset data.
List of possible region names locations: https://cloud.google.com/bigquery/docs/locations
chunk_size (int): Optional
The size of a chunk of data whenever iterating (in bytes).
This must be a multiple of 256 KB per the API specification.
If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
"""

if path is None:
@@ -651,7 +656,10 @@
):

Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
path, mode="staging", if_exists=if_storage_data_exists
path,
mode="staging",
if_exists=if_storage_data_exists,
chunk_size=chunk_size,
)

# Create Dataset if it doesn't exist
@@ -835,7 +843,14 @@ def delete(self, mode):
action="deleted",
)

def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
def append(
self,
filepath,
partitions=None,
if_exists="replace",
chunk_size=None,
**upload_args,
):
"""Appends new data to existing BigQuery table.
As long as the data has the same schema. It appends the data in the
@@ -854,6 +869,10 @@ def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
* 'raise' : Raises Conflict exception
* 'replace' : Replace table
* 'pass' : Do nothing
chunk_size (int): Optional
The size of a chunk of data whenever iterating (in bytes).
This must be a multiple of 256 KB per the API specification.
If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
"""
if not self.table_exists("staging"):
raise BaseDosDadosException(
@@ -865,6 +884,7 @@
mode="staging",
partitions=partitions,
if_exists=if_exists,
chunk_size=chunk_size,
**upload_args,
)
logger.success(
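
Table.create and Table.append accept the same chunk_size and pass it through to Storage.upload; append still requires the staging table to exist. A usage sketch with hypothetical identifiers:

    from basedosdados import Table

    tb = Table(dataset_id="br_example_dataset", table_id="example_table")
    tb.append(
        filepath="data/new_rows.csv",   # hypothetical path; schema must match the existing table
        if_exists="replace",
        chunk_size=4 * 256 * 1024,      # 1 MB, a multiple of 256 KB
    )
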
4 changes: 2 additions & 2 deletions python-package/pyproject.toml
@@ -13,7 +13,7 @@ packages = [
]
readme = "README.md"
repository = "https://github.com/base-dos-dados/bases"
version = "1.6.3-beta.2"
version = "1.6.4"

[tool.poetry.scripts]
basedosdados = 'basedosdados.cli.cli:cli'
@@ -26,6 +26,7 @@ click = "8.0.3"
google-cloud-bigquery = "2.30.1"
google-cloud-bigquery-storage = "1.1.0"
google-cloud-storage = "1.42.3"
importlib-metadata = "^4.11.3"
ipykernel = "5.3.4"
jupyter = "^1.0.0"
loguru = "^0.6.0"
@@ -44,7 +45,6 @@
toml = "^0.10.2"
tomlkit = "0.7.0"
tqdm = "4.50.2"
importlib-metadata = "^4.11.3"

[tool.black]
# Use the more relaxed max line length permitted in PEP8.