Merge pull request #242 from catalyst-cooperative/datapackage_fix
Fix datapackage generation from multiple taxonomies
zschira authored Jul 17, 2024
2 parents d550b43 + 15ccb89 commit ff29071
Showing 10 changed files with 147 additions and 74 deletions.
6 changes: 3 additions & 3 deletions README.rst
@@ -124,7 +124,7 @@ filings, use the command:

.. code-block:: console
-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip
The tool expects the ``--taxonomy`` option to point to a zipfile containing archived
@@ -144,7 +144,7 @@ batches of 50 filings at a time.

.. code-block:: console
-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip \
       --workers 5 \
       --batch-size 50
@@ -160,7 +160,7 @@ filings and taxonomy, run the following command.

.. code-block:: console
-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip \
       --metadata-path metadata.json \
       --datapackage-path datapackage.json
Binary file modified examples/ferc1-2021-sample.zip
2 changes: 1 addition & 1 deletion src/ferc_xbrl_extractor/cli.py
@@ -24,7 +24,7 @@ def parse():
    )
    parser.add_argument(
        "-d",
-        "--db_path",
+        "--db-path",
        default="ferc-xbrl.sqlite",
        help="Store data in sqlite database specified in argument",
    )
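Side note on why this rename is safe: argparse converts hyphens in long option names to underscores when it builds attribute names on the parsed namespace, so code reading `args.db_path` keeps working after the switch from `--db_path` to `--db-path`. A minimal standalone sketch (not the project's actual parser):

```python
import argparse

parser = argparse.ArgumentParser(prog="xbrl_extract")
parser.add_argument(
    "-d",
    "--db-path",
    default="ferc-xbrl.sqlite",
    help="Store data in sqlite database specified in argument",
)

# The hyphen in "--db-path" becomes an underscore in the attribute name,
# so the option is still read as args.db_path.
args = parser.parse_args(["--db-path", "out.sqlite"])
assert args.db_path == "out.sqlite"
```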
104 changes: 91 additions & 13 deletions src/ferc_xbrl_extractor/datapackage.py
@@ -13,6 +13,8 @@
from ferc_xbrl_extractor.instance import Instance
from ferc_xbrl_extractor.taxonomy import Concept, LinkRole, Taxonomy

+logger = get_logger(__name__)


class Field(BaseModel):
    """A generic field descriptor, as per Frictionless Data specs.
@@ -332,6 +334,47 @@ def get_period_type(self):
        period_type = "instant" if "date" in self.schema_.primary_key else "duration"
        return period_type

+    def merge_resources(self, other: "Resource", other_version: str) -> "Resource":
+        """Merge same resource from multiple taxonomies.
+
+        This method attempts to merge resource definitions from multiple taxonomies,
+        creating a unified schema for the table in question. It does this by first
+        comparing the primary keys of the two tables. If the primary keys aren't
+        exactly the same it will raise an error. For the remaining columns, this
+        method will check if there are any that are new or missing in ``other``.
+        New columns will be added to the table's schema, and missing columns will
+        be logged, but remain in the schema.
+        """
+        if self.schema_.primary_key != other.schema_.primary_key:
+            raise RuntimeError(
+                f"Can't merge resource {self.name} when versions have incompatible schemas"
+            )
+        original_fields = {field.name for field in self.schema_.fields}
+        other_fields = {field.name for field in other.schema_.fields}
+
+        if missing_fields := original_fields - other_fields:
+            logger.warning(
+                f"The following fields were removed from table {self.name} "
+                f"in taxonomy version {other_version}: {missing_fields}"
+            )
+
+        fields = self.schema_.fields
+        if new_fields := other_fields - original_fields:
+            logger.warning(
+                f"The following fields were added to table {self.name} "
+                f"in taxonomy version {other_version}: {new_fields}"
+            )
+            # Add new fields to schema
+            fields += [
+                field for field in other.schema_.fields if field.name in new_fields
+            ]
+        # Return resource with updated schema
+        return self.model_copy(
+            update={
+                "schema": Schema(primary_key=self.schema_.primary_key, fields=fields)
+            }
+        )


class FactTable:
    """Class to handle constructing a dataframe from an XBRL fact table.
@@ -355,7 +398,6 @@ def __init__(self, schema: Schema, period_type: str):
            if field.name not in schema.primary_key
        ]
        self.instant = period_type == "instant"
-        self.logger = get_logger(__name__)

    def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
        """Construct dataframe from a parsed XBRL instance.
@@ -413,24 +455,60 @@ class Datapackage(BaseModel):
    resources: list[Resource]

    @classmethod
-    def from_taxonomy(
-        cls, taxonomy: Taxonomy, db_uri: str, form_number: int = 1
+    def from_taxonomies(
+        cls, taxonomies: dict[str, Taxonomy], db_uri: str, form_number: int = 1
    ) -> "Datapackage":
-        """Construct a Datapackage from an XBRL Taxonomy.
+        """Construct a Datapackage from parsed XBRL taxonomies.
+
+        FERC regularly releases new versions of their XBRL taxonomies, meaning
+        data from different years conforms to slightly different structures. This
+        method will attempt to merge these taxonomy versions into a single unified
+        schema defined in a Datapackage descriptor.
+
+        The exact logic for merging taxonomies is as follows. First, the oldest
+        available taxonomy is used to construct a baseline datapackage descriptor.
+        Next, it will parse subsequent versions and compare the set of tables
+        found with the baseline. New tables will be added to the schema, removed
+        tables will simply be logged but remain in the schema, and tables present
+        in both versions will get a deeper column-level comparison. For more info
+        on the table comparison, see ``Resource.merge_resources``.
+
        Args:
-            taxonomy: XBRL taxonomy which defines the structure of the database.
+            taxonomies: Dictionary mapping version strings to taxonomies to merge
+                into a Datapackage.
            db_uri: Path to database required for a Frictionless resource.
            form_number: FERC form number used for datapackage name.
        """
-        resources = []
-        for role in taxonomy.roles:
-            for period_type in ["duration", "instant"]:
-                resource = Resource.from_link_role(role, period_type, db_uri)
-                if resource:
-                    resources.append(resource)
+        resources = {}
+        logger.info("Attempting to merge taxonomies into a single datapackage.")
+        # Iterate through taxonomies in order of release and attempt to merge
+        for i, (taxonomy_version, taxonomy) in enumerate(sorted(taxonomies.items())):
+            baseline_resources = set(resources.keys())
+            new_resources = set()
+            for role in taxonomy.roles:
+                for period_type in ["duration", "instant"]:
+                    if resource := Resource.from_link_role(role, period_type, db_uri):
+                        new_resources.add(resource.name)
+                        if resource.name not in resources:
+                            # All resources will be new when parsing first taxonomy
+                            if i > 0:
+                                logger.warning(
+                                    f"Resource {resource.name} is new in {taxonomy_version}"
+                                )
+                            # Add new table to schema
+                            resources[resource.name] = resource
+                        else:
+                            # Merge tables in both versions of taxonomy
+                            resources[resource.name] = resources[
+                                resource.name
+                            ].merge_resources(resource, taxonomy_version)
+            if missing_resources := baseline_resources - new_resources:
+                logger.warning(
+                    f"The following resources were removed in {taxonomy_version}: {missing_resources}"
                )

-        return cls(resources=resources, name=f"ferc{form_number}-extracted-xbrl")
+        return cls(
+            resources=list(resources.values()), name=f"ferc{form_number}-extracted-xbrl"
+        )

    def get_fact_tables(
        self, filter_tables: set[str] | None = None
@@ -439,7 +517,7 @@
        Args:
            filter_tables: Optionally specify the set of tables to extract.
-            If None, all possible tables will be extracted.
+                If None, all possible tables will be extracted.
        """
        if filter_tables:
            filtered_resources = (r for r in self.resources if r.name in filter_tables)
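To make the merge rules above concrete, here is a minimal self-contained sketch of the column-level logic that `Resource.merge_resources` applies. `SimpleSchema` and `merge_schemas` are illustrative stand-ins, not the project's pydantic models, and the field names are made up:

```python
from dataclasses import dataclass


@dataclass
class SimpleSchema:
    """Stripped-down stand-in for the Frictionless table schema."""

    primary_key: list[str]
    fields: list[str]  # field names only, for illustration


def merge_schemas(base: SimpleSchema, other: SimpleSchema) -> SimpleSchema:
    """Merge two versions of a table schema following the rules above."""
    if base.primary_key != other.primary_key:
        # Mirrors the RuntimeError raised by merge_resources.
        raise RuntimeError("Incompatible primary keys; refusing to merge")
    # Fields missing from the newer version are kept (only logged in the real
    # code); fields new in the newer version are appended to the schema.
    new_fields = [f for f in other.fields if f not in base.fields]
    return SimpleSchema(base.primary_key, base.fields + new_fields)


v2022 = SimpleSchema(["entity_id", "date"], ["revenue", "expenses"])
v2023 = SimpleSchema(["entity_id", "date"], ["revenue", "fuel_costs"])
merged = merge_schemas(v2022, v2023)
# "expenses" survives even though the newer version dropped it,
# and "fuel_costs" is appended as a new column.
assert merged.fields == ["revenue", "expenses", "fuel_costs"]
```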
4 changes: 2 additions & 2 deletions src/ferc_xbrl_extractor/taxonomy.py
@@ -262,7 +262,7 @@ def from_source(
        return cls(roles=roles)


-def get_metadata_from_taxonomies(taxonomies: list[Taxonomy]) -> dict:
+def get_metadata_from_taxonomies(taxonomies: dict[str, Taxonomy]) -> dict:
    """Get dictionary of taxonomy metadata.

    XBRL taxonomies contain metadata that can be useful for interpreting reported
@@ -273,7 +273,7 @@ def get_metadata_from_taxonomies(taxonomies: list[Taxonomy]) -> dict:

    duration_metadata = {}
    instant_metadata = {}
-    for taxonomy in taxonomies:
+    for taxonomy in taxonomies.values():
        # Get metadata for duration tables
        duration_metadata.update(
            {
40 changes: 17 additions & 23 deletions src/ferc_xbrl_extractor/xbrl.py
@@ -83,7 +83,7 @@ def extract(

def table_data_from_instances(
    instance_builders: list[InstanceBuilder],
-    table_defs: dict[str, dict[str, FactTable]],
+    table_defs: dict[str, FactTable],
    batch_size: int | None = None,
    workers: int | None = None,
) -> tuple[dict[str, pd.DataFrame], dict[str, list]]:
@@ -200,7 +200,7 @@ def process_instance(
logger.info(f"Extracting {instance.filing_name}")

dfs = {}
for key, table_def in table_defs[instance.taxonomy_version].items():
for key, table_def in table_defs.items():
dfs[key] = table_def.construct_dataframe(instance)

return dfs
@@ -213,7 +213,7 @@ def get_fact_tables(
    filter_tables: set[str] | None = None,
    datapackage_path: str | None = None,
    metadata_path: str | None = None,
-) -> dict[str, dict[str, FactTable]]:
+) -> dict[str, FactTable]:
    """Parse taxonomy from URL.

    XBRL defines 'fact tables' that group related facts. These fact
@@ -238,7 +238,7 @@
    Returns:
        Dictionary mapping table names to structure.
    """
-    taxonomies = []
+    taxonomies = {}
    fact_tables = {}
    metadata = {}
    with ZipFile(taxonomy_source, "r") as taxonomy_archive:
@@ -252,30 +252,24 @@

            taxonomy_entry_point = f"taxonomy/form{form_number}/{taxonomy_date}/form/form{form_number}/form-{form_number}_{taxonomy_date}.xsd"
            taxonomy = Taxonomy.from_source(f, entry_point=taxonomy_entry_point)
-            taxonomies.append(taxonomy)
+            taxonomies[taxonomy_version] = taxonomy

-            datapackage = Datapackage.from_taxonomy(
-                taxonomy, db_uri, form_number=form_number
-            )
-
-            if datapackage_path:
-                # Verify that datapackage descriptor is valid before outputting
-                report = Package.validate_descriptor(
-                    datapackage.model_dump(by_alias=True)
-                )
-
-                if not report.valid:
-                    raise RuntimeError(
-                        f"Generated datapackage is invalid - {report.errors}"
-                    )
-
-                # Write to JSON file
-                with Path(datapackage_path).open(mode="w") as f:
-                    f.write(datapackage.model_dump_json(by_alias=True))
-
-            fact_tables[taxonomy_version] = datapackage.get_fact_tables(
-                filter_tables=filter_tables
-            )
+    datapackage = Datapackage.from_taxonomies(
+        taxonomies, db_uri, form_number=form_number
+    )
+
+    if datapackage_path:
+        # Verify that datapackage descriptor is valid before outputting
+        report = Package.validate_descriptor(datapackage.model_dump(by_alias=True))
+
+        if not report.valid:
+            raise RuntimeError(f"Generated datapackage is invalid - {report.errors}")
+
+        # Write to JSON file
+        with Path(datapackage_path).open(mode="w") as f:
+            f.write(datapackage.model_dump_json(by_alias=True, indent=2))
+
+    fact_tables = datapackage.get_fact_tables(filter_tables=filter_tables)

    # Save taxonomy metadata
    metadata = get_metadata_from_taxonomies(taxonomies)
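In miniature, the shape change that drives this file's edits (the table and version names below are hypothetical): `get_fact_tables` previously returned one table mapping per taxonomy version, and now returns a single merged mapping, which is why `process_instance` no longer looks tables up via `instance.taxonomy_version`.

```python
# Before this PR: FactTables nested per taxonomy version, so each filing
# had to be matched to the right version before extraction.
table_defs_before = {
    "form-1-2022-01-01.zip": {"identification_001_duration": ...},
    "form-1-2023-11-01.zip": {"identification_001_duration": ...},
}

# After this PR: one merged mapping covers filings from every version.
table_defs_after = {"identification_001_duration": ...}
```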
2 changes: 1 addition & 1 deletion tests/integration/console_scripts_test.py
@@ -41,7 +41,7 @@ def test_extract_example_filings(script_runner, tmp_path, test_dir):
        [
            "xbrl_extract",
            str(data_dir / "ferc1-xbrl-2021.zip"),
-            "--db_path",
+            "--db-path",
            str(out_db),
            "--taxonomy",
            str(data_dir / "ferc1-xbrl-taxonomies.zip"),
Binary file modified tests/integration/data/ferc1-xbrl-taxonomies.zip
18 changes: 4 additions & 14 deletions tests/integration/data_quality_test.py
@@ -35,7 +35,7 @@ def extracted(metadata_dir, data_dir, request) -> ExtractOutput:


def test_lost_facts_pct(extracted, request):
-    table_defs_map, table_data, stats = extracted
+    table_defs, table_data, stats = extracted
    total_facts = sum(
        instance_stats["total_facts"] for instance_stats in stats.values()
    )
@@ -65,16 +65,8 @@
    assert instance_used_ratio > per_filing_threshold and instance_used_ratio <= 1


-def _get_relevant_table_defs(table_defs_map: dict):
-    # Note: this just grabs table_defs from a random version of the taxonomy.
-    # The taxonomy versions are close enough that this works for now, but this
-    # could break tests in the future.
-    return list(table_defs_map.values())[0]
-
-
def test_publication_time(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

    for table_name, table in table_defs.items():
        assert (
@@ -86,8 +78,7 @@


def test_all_data_has_corresponding_id(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

    [id_table_name] = [
        name
@@ -109,8 +100,7 @@


def test_null_values(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

    for table_name, table in table_defs.items():
        dataframe = table_data[table_name]
45 changes: 28 additions & 17 deletions tests/integration/datapackage_test.py
@@ -23,28 +23,39 @@

def test_datapackage_generation(test_dir, data_dir):
    """Test that datapackage descriptor is valid."""
-    with (
-        zipfile.ZipFile(data_dir / "ferc1-xbrl-taxonomies.zip") as archive,
-        archive.open("form-1-2022-01-01.zip", mode="r") as f,
-    ):
-        taxonomy = Taxonomy.from_source(
-            f,
-            entry_point=Path(
-                "taxonomy/form1/2022-01-01/form/form1/form-1_2022-01-01.xsd"
-            ),
-        )
-    datapackage = Datapackage.from_taxonomy(taxonomy, "sqlite:///test_db.sqlite")
-
-    filtered_tables = datapackage.get_fact_tables(
-        filter_tables={"identification_001_duration"}
-    )
-    assert set(filtered_tables.keys()) == {"identification_001_duration"}
+    taxonomies = {}
+    for version, entry_point in [
+        (
+            "form-1-2022-01-01.zip",
+            "taxonomy/form1/2022-01-01/form/form1/form-1_2022-01-01.xsd",
+        ),
+        (
+            "form-1-2023-11-01.zip",
+            "taxonomy/form1/2023-11-01/form/form1/form-1_2023-11-01.xsd",
+        ),
+    ]:
+        with (
+            zipfile.ZipFile(data_dir / "ferc1-xbrl-taxonomies.zip") as archive,
+            archive.open(version, mode="r") as f,
+        ):
+            taxonomies[version] = Taxonomy.from_source(
+                f,
+                entry_point=Path(entry_point),
+            )
+    datapackage = Datapackage.from_taxonomies(taxonomies, "sqlite:///test_db.sqlite")
+
+    filter_tables = {
+        "identification_001_duration",
+        "energy_storage_operations_small_plants_419_duration",
+    }
+    filtered_tables = datapackage.get_fact_tables(filter_tables=filter_tables)
+    assert set(filtered_tables.keys()) == filter_tables

    all_tables = datapackage.get_fact_tables()

    # 366 was just the value we had - this assertion is more of a regression
    # test than a normative statement
-    assert len(all_tables) == 366
+    assert len(all_tables) == 370

    assert Package.validate_descriptor(datapackage.model_dump(by_alias=True))

