Skip to content

Commit

Permalink
scraped_gazettes: propaga modificacoes pelo repositorio
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju committed Feb 3, 2025
1 parent fa9ebd9 commit f5b6c7e
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 4 deletions.
1 change: 1 addition & 0 deletions data_collection/gazette/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ class Gazette(scrapy.Item):
document = scrapy.Field()
document_sequence = scrapy.Field()
granularity = scrapy.Field()
spider_name = scrapy.Field()
7 changes: 7 additions & 0 deletions data_collection/gazette/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def process_item(self, item, spider):
class DefaultValuesPipeline:
def process_item(self, item, spider):
item["public_entity_id"] = getattr(spider, "PUBLIC_ENTITY_ID")
item["spider_name"] = getattr(spider, "name")

# Date manipulation to allow jsonschema to validate correctly
item["date"] = str(item["date"])
Expand Down Expand Up @@ -79,6 +80,12 @@ def process_item(self, item, spider):
"power",
"scraped_at",
"public_entity_id",
"act_category",
"publishing_body",
"document",
"document_sequence",
"granularity",
"spider_name",
]
gazette_item = {field: item.get(field) for field in fields}
gazette_item["date"] = dt.datetime.strptime(
Expand Down
39 changes: 35 additions & 4 deletions data_collection/gazette/resources/gazette_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"type": "object",
"properties": {
"date": {
- "description": "Date of the gazzete",
"description": "Date of the gazette",
"type": "string",
"format": "date"
},
Expand All @@ -19,8 +19,12 @@
"type": "string"
}
},
"edition_number": {
"description": "Publication edition number of the gazette",
"type": "string"
},
"is_extra_edition": {
- "description": "Determines if the edition of the gazzete is extra or not",
"description": "Determines if the edition of the gazette is extra or not",
"type": "boolean"
},
"public_entity_id": {
Expand All @@ -34,11 +38,37 @@
"executive_legislative"
]
},
- "scraped_at":{
- "description": "When the gazzete was scraped",
"scraped_at": {
"description": "When the gazette was scraped",
"type": "string",
"format": "date-time"
},
"act_category": {
"description": "...",
"type": "string"
},
"publishing_body": {
"description": "...",
"type": "string"
},
"document": {
"description": "...",
"type": "string"
},
"document_sequence": {
"description": "...",
"type": "integer"
},
"granularity": {
"description": "...",
"type": "string",
"enum": [
"act",
"section",
"individual",
"aggregate"
]
},
"files": {
"type": "array",
"minItems": 1,
Expand All @@ -65,6 +95,7 @@
"date",
"public_entity_id",
"power",
"granularity",
"scraped_at"
],
"anyOf": [
Expand Down

0 comments on commit f5b6c7e

Please sign in to comment.