Skip to content

Commit

Permalink
[dados] metadata (#1340)
Browse files Browse the repository at this point in the history
* feat: init metadata

* preenchimento d

* update dataset_config

* update table_config columns

* update

* update br_bd_metadados

* final update

---------

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: tricktx <[email protected]>
  • Loading branch information
3 people authored Mar 15, 2023
1 parent 4e2cceb commit 0ca8873
Show file tree
Hide file tree
Showing 36 changed files with 2,483 additions and 0 deletions.
34 changes: 34 additions & 0 deletions bases/br_bd_metadados/columns/publish.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
Query that publishes the table.
This is the place to:
    - rename, reorder, and change the types of columns
    - join with other tables
    - create extra columns (e.g. logs, proportions, etc.)
Any column defined here must also exist in `table_config.yaml`.
Also, feel free to replace obscure names with something a bit more explicit.
TYPES:
    - To change a column's type, replace STRING with another valid type.
    - Example: `SAFE_CAST(column_name AS NUMERIC) column_name`
    - More details: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
*/

-- Exposes every column of the staging table under the same name and in the
-- same order, each one cast to STRING.
CREATE VIEW `basedosdados-dev.br_bd_metadados.columns` AS
SELECT
    SAFE_CAST(t.table_id AS STRING) AS table_id,
    SAFE_CAST(t.name AS STRING) AS name,
    SAFE_CAST(t.bigquery_type AS STRING) AS bigquery_type,
    SAFE_CAST(t.description AS STRING) AS description,
    SAFE_CAST(t.temporal_coverage AS STRING) AS temporal_coverage,
    SAFE_CAST(t.covered_by_dictionary AS STRING) AS covered_by_dictionary,
    SAFE_CAST(t.directory_column AS STRING) AS directory_column,
    SAFE_CAST(t.measurement_unit AS STRING) AS measurement_unit,
    SAFE_CAST(t.has_sensitive_data AS STRING) AS has_sensitive_data,
    SAFE_CAST(t.observations AS STRING) AS observations,
    SAFE_CAST(t.is_in_staging AS STRING) AS is_in_staging,
    SAFE_CAST(t.is_partition AS STRING) AS is_partition
FROM `basedosdados-dev.br_bd_metadados_staging.columns` AS t
1 change: 1 addition & 0 deletions bases/br_bd_metadados/columns/schema-prod.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"name": "table_id", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "name", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "bigquery_type", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "description", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "temporal_coverage", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "covered_by_dictionary", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, 
"directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "directory_column", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "measurement_unit", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "has_sensitive_data", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "observations", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "is_in_staging", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, 
"has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "is_partition", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}]
1 change: 1 addition & 0 deletions bases/br_bd_metadados/columns/schema-staging.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"name": "table_id", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "name", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "bigquery_type", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "description", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "temporal_coverage", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "covered_by_dictionary", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": 
null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "directory_column", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "measurement_unit", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "has_sensitive_data", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "observations", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "is_in_staging", "bigquery_type": "string", "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "is_partition", "bigquery_type": "string", "description": null, "temporal_coverage": null, 
"covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "observations": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}]
291 changes: 291 additions & 0 deletions bases/br_bd_metadados/columns/table_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
# Igual ao dataset.name mas como lower case.
# Exemplos: br_ibge_populacao, br_inep_censo_escolar
dataset_id: br_bd_metadados

table_id: columns

# Título da tabela.
title: Colunas na BD+

# Descreva a tabela. Essas são as primeiras frases que um usuário vai ver.
# Você não precisa ser muito conciso. Sinta-se à vontade para dar exemplos de
# como usar os dados.
# Se souber, liste também aplicações: pesquisa, apps, etc. que usem os dados.
description: Colunas na BD+

# As máximas unidades espaciais que a tabela cobre.
# Exemplo:
# - sa.br
spatial_coverage:
- world

# Anos cobertos pela tabela.
# Preencher como lista de intervalos.
# Exemplos: ['1995(1)2019'], ['2002(2)2010', '2016', '2020'].
temporal_coverage:
- '2023-03-15'

# A unidade temporal com a qual a tabela é atualizada.
# Opções em 'https://basedosdados.org/api/3/action/bd_available_options'
update_frequency: day

# Nível de observação da tabela: o que representa cada linha.
observation_level:
- entity: other
columns:
- table_id
- entity: other
columns:
- name

last_updated:
metadata: '2023-03-07'
data: '2023-03-07 14:54:00'
release: '2023-03-07 14:54:00'

# Versão da tabela. Seguindo o padrão de semantic versioning.
# Exemplo: v1.1.3
version: v1.0

# Quem está preenchendo esses metadados?
published_by:
name: Patrick Teixeira
email: [email protected]
github_user: tricktx
ckan_user: patrickteixeira
website:

# Qual organização/departamento/pessoa tratou os dados?
# Às vezes há um ponto intermediário entre os dados originais e subir na Base dos Dados.
# Se essa pessoa é você, preencha abaixo com suas informações.
data_cleaned_by:
name: Ricardo Dahis
email: [email protected]
github_user: rdahis
ckan_user: rdahis
website: www.ricardodahis.com

# Se houve passos de tratamento, limpeza e manipulação de dados, descreva-os aqui.
data_cleaning_description:

# Url do código de limpeza dos dados.
data_cleaning_code_url:

# Organização que ajudou institucionalmente na disponibilização dos dados.
partner_organization:
name:
organization_id:

# Url dos dados originais no GCP Storage.
raw_files_url:

# Url dos arquivos auxiliares no GCP Storage.
auxiliary_files_url:

# Url da tabela de arquitetura no GCP Storage.
architecture_url:

source_bucket_name: basedosdados-dev

project_id_prod: basedosdados-dev

project_id_staging: basedosdados-dev

# Liste as colunas da tabela que representam partições.
# Não esqueça de deletar essas colunas nas tabelas .csv na hora de subir para o BigQuery.
# Isso poupará muito tempo e dinheiro às pessoas utilizando essa tabela.
# Se não houver partições, não modifique abaixo.
partitions:

# Quais são as colunas? Certifique-se de escrever uma boa descrição, as pessoas vão gostar
# para saber sobre o que é a coluna.
# Adicionar todas as colunas manualmente pode ser bastante cansativo, por isso, quando
# inicializando este arquivo de configuração, você pode apontar a função para uma amostra de dados que
# preencherá automaticamente as colunas.
# Algumas colunas existirão apenas na tabela final, você as construirá em `publish.sql`.
# Para esses, defina is_in_staging como False.
# Além disso, você deve adicionar as colunas de partição aqui e definir is_partition como True.
columns:
- name: table_id
bigquery_type: string
description: Table ID
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: name
bigquery_type: string
description: Name
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: bigquery_type
bigquery_type: string
description: Bigquery Type
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: description
bigquery_type: string
description: Description
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: temporal_coverage
bigquery_type: string
description: Temporal Coverage
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit: year
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: covered_by_dictionary
bigquery_type: string
description: Indicates if it is covered by dictionary
temporal_coverage:
- (1)
# Same key and value that every other column entry in this list declares.
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: directory_column
bigquery_type: string
description: Directory Column
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: measurement_unit
bigquery_type: string
description: Measurement Unit
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: has_sensitive_data
bigquery_type: string
description: Indicates if it has sensitive data
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: observations
bigquery_type: string
description: Observations
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: is_in_staging
bigquery_type: string
description: Is in staging
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false
- name: is_partition
bigquery_type: string
description: Is Partition
temporal_coverage:
- (1)
covered_by_dictionary: no
directory_column:
dataset_id:
table_id:
column_name:
measurement_unit:
has_sensitive_data: no
observations:
is_in_staging: true
is_partition: false

number_rows: 14880

metadata_modified:
41 changes: 41 additions & 0 deletions bases/br_bd_metadados/columns/table_description.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
None

Para saber mais acesse:
Website:
Github:

Ajude a manter o projeto :)
Apoia-se: https://apoia.se/basedosdados

Publicado por
-------------
Nome: None
Código:
Tratado por
-----------
Nome: None
Código:






Cobertura Temporal
------------------




Cobertura Espacial
------------------







Frequência de Atualização
-------------------------
day
Loading

0 comments on commit 0ca8873

Please sign in to comment.