diff --git a/pipelines/datasets/br_cgu_cartao_pagamento/schedules.py b/pipelines/datasets/br_cgu_cartao_pagamento/schedules.py
index c9bd50a4c..99ec99fa3 100644
--- a/pipelines/datasets/br_cgu_cartao_pagamento/schedules.py
+++ b/pipelines/datasets/br_cgu_cartao_pagamento/schedules.py
@@ -5,6 +5,7 @@ from pipelines.constants import constants
 from pipelines.utils.crawler_cgu.constants import constants as constants_cgu
 
+
 every_day_microdados_governo_federal = Schedule(
     clocks=[
         CronClock(
diff --git a/pipelines/utils/crawler_cgu/constants.py b/pipelines/utils/crawler_cgu/constants.py
index ab32534c9..0f31a39fd 100644
--- a/pipelines/utils/crawler_cgu/constants.py
+++ b/pipelines/utils/crawler_cgu/constants.py
@@ -175,3 +175,4 @@ class constants(Enum):  # pylint: disable=c0103
         "READ": "_TermoAditivo.csv",
     },
 }
+
diff --git a/pipelines/utils/crawler_cgu/flows.py b/pipelines/utils/crawler_cgu/flows.py
index d5a09c638..99d28f291 100644
--- a/pipelines/utils/crawler_cgu/flows.py
+++ b/pipelines/utils/crawler_cgu/flows.py
@@ -32,7 +32,7 @@
 ) as flow_cgu_cartao_pagamento:
     dataset_id = Parameter("dataset_id", default='br_cgu_cartao_pagamento', required=True)
-    table_id = Parameter("table_id", default ="microdados_governo_federal", required=True)
+    table_id = Parameter("table_id", required=True)
     ####
     # Relative_month = 1 means that the data will be downloaded for the current month
     ####
@@ -317,3 +317,5 @@
 )
 flow_cgu_licitacao_contrato.storage = GCS(constants.GCS_FLOWS_BUCKET.value)
 flow_cgu_licitacao_contrato.run_config = KubernetesRun(image=constants.DOCKER_IMAGE.value)
+
+
diff --git a/pipelines/utils/crawler_cgu/tasks.py b/pipelines/utils/crawler_cgu/tasks.py
index bec196b99..99b4a47ee 100644
--- a/pipelines/utils/crawler_cgu/tasks.py
+++ b/pipelines/utils/crawler_cgu/tasks.py
@@ -42,8 +42,8 @@ def partition_data(table_id: str, dataset_id : str) -> str:
     log("---------------------------- Read data ----------------------------")
     df = read_csv(dataset_id = dataset_id, table_id = table_id)
     log(df.head())
-    if dataset_id == "br_cgu_cartao_pagamento:":
-        log(" ---------------------------- Partiting data -----------------------")
+    if dataset_id == "br_cgu_cartao_pagamento":
+        log("---------------------------- Partiting data -----------------------")
         to_partitions(
             data = df,
             partition_columns=['ANO_EXTRATO', 'MES_EXTRATO'],
@@ -77,6 +77,7 @@ def partition_data(table_id: str, dataset_id : str) -> str:
     log("---------------------------- Data partitioned ----------------------")
     return constants.TABELA_SERVIDORES.value[table_id]['OUTPUT']
 
+
 @task
 def get_current_date_and_download_file(table_id : str,
                                        dataset_id : str,
diff --git a/pipelines/utils/crawler_cgu/utils.py b/pipelines/utils/crawler_cgu/utils.py
index 1746b0022..ac243adbf 100644
--- a/pipelines/utils/crawler_cgu/utils.py
+++ b/pipelines/utils/crawler_cgu/utils.py
@@ -2,6 +2,7 @@
 """
 General purpose functions for the br_cgu_cartao_pagamento project
 """
+
 import datetime
 from arrow import get
 from dateutil.relativedelta import relativedelta
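
Note on the `partition_data` change in `tasks.py`: the old comparison against `"br_cgu_cartao_pagamento:"` (with a stray trailing colon) could never match the dataset id, so the `to_partitions` branch was effectively dead. For reviewers unfamiliar with the helper, the sketch below illustrates the Hive-style directory layout that `partition_columns=['ANO_EXTRATO', 'MES_EXTRATO']` implies. This is a minimal, hypothetical stand-in assuming a plain pandas DataFrame, not the repository's actual `to_partitions` implementation; function and path names are illustrative only.

```python
# Hypothetical sketch of Hive-style partitioning by ANO_EXTRATO/MES_EXTRATO.
# NOT the repository's to_partitions helper; it only shows the key=value
# folder layout that the partition_columns argument implies.
import os

import pandas as pd


def to_partitions_sketch(data: pd.DataFrame, partition_columns: list, savepath: str) -> None:
    """Write one CSV per unique combination of the partition columns."""
    for keys, chunk in data.groupby(partition_columns):
        keys = keys if isinstance(keys, tuple) else (keys,)
        # e.g. <savepath>/ANO_EXTRATO=2024/MES_EXTRATO=1/data.csv
        subdir = os.path.join(
            savepath, *[f"{col}={val}" for col, val in zip(partition_columns, keys)]
        )
        os.makedirs(subdir, exist_ok=True)
        chunk.drop(columns=partition_columns).to_csv(
            os.path.join(subdir, "data.csv"), index=False
        )


if __name__ == "__main__":
    df = pd.DataFrame(
        {
            "ANO_EXTRATO": [2024, 2024, 2023],
            "MES_EXTRATO": [1, 2, 12],
            "VALOR": [10.0, 20.0, 30.0],
        }
    )
    to_partitions_sketch(df, ["ANO_EXTRATO", "MES_EXTRATO"], "/tmp/microdados")
```

The resulting `ANO_EXTRATO=.../MES_EXTRATO=...` folders are the pattern Hive-style loaders typically read partition values from, which is presumably why the columns are dropped from the partitioned files themselves.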