Merge pull request #242 from catalyst-cooperative/datapackage_fix
Fix datapackage generation from multiple taxonomies
zschira authored Jul 17, 2024
2 parents d550b43 + 15ccb89 commit ff29071
Showing 10 changed files with 147 additions and 74 deletions.
6 changes: 3 additions & 3 deletions README.rst
@@ -124,7 +124,7 @@ filings, use the command:

.. code-block:: console
-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip
The tool expects the ``--taxonomy`` option to point to a zipfile containing archived
@@ -144,7 +144,7 @@ batches of 50 filings at a time.

.. code-block:: console
-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip \
       --workers 5 \
       --batch-size 50
@@ -160,7 +160,7 @@ filings and taxonomy, run the following command.

.. code-block:: console
-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip \
       --metadata-path metadata.json \
       --datapackage-path datapackage.json
Binary file modified examples/ferc1-2021-sample.zip
2 changes: 1 addition & 1 deletion src/ferc_xbrl_extractor/cli.py
@@ -24,7 +24,7 @@ def parse():
    )
    parser.add_argument(
        "-d",
-        "--db_path",
+        "--db-path",
        default="ferc-xbrl.sqlite",
        help="Store data in sqlite database specified in argument",
    )
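Side note on why this rename is safe: argparse converts hyphens in long option names to underscores when it builds attribute names on the parsed namespace, so code reading `args.db_path` keeps working after the switch from `--db_path` to `--db-path`. A minimal standalone sketch (not the project's actual parser):

```python
import argparse

parser = argparse.ArgumentParser(prog="xbrl_extract")
parser.add_argument(
    "-d",
    "--db-path",
    default="ferc-xbrl.sqlite",
    help="Store data in sqlite database specified in argument",
)

# The hyphen in "--db-path" becomes an underscore in the attribute name,
# so the option is still read as args.db_path.
args = parser.parse_args(["--db-path", "out.sqlite"])
assert args.db_path == "out.sqlite"
```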
104 changes: 91 additions & 13 deletions src/ferc_xbrl_extractor/datapackage.py
@@ -13,6 +13,8 @@
from ferc_xbrl_extractor.instance import Instance
from ferc_xbrl_extractor.taxonomy import Concept, LinkRole, Taxonomy

+logger = get_logger(__name__)


class Field(BaseModel):
    """A generic field descriptor, as per Frictionless Data specs.
@@ -332,6 +334,47 @@ def get_period_type(self):
        period_type = "instant" if "date" in self.schema_.primary_key else "duration"
        return period_type

+    def merge_resources(self, other: "Resource", other_version: str) -> "Resource":
+        """Merge same resource from multiple taxonomies.
+
+        This method attempts to merge resource definitions from multiple taxonomies,
+        creating a unified schema for the table in question. It does this by first
+        comparing the primary keys of the two tables. If the primary keys aren't
+        exactly the same it will raise an error. For the remaining columns, this
+        method will check if there are any that are new or missing in ``other``.
+        New columns will be added to the table's schema, and missing columns will
+        be logged, but remain in the schema.
+        """
+        if self.schema_.primary_key != other.schema_.primary_key:
+            raise RuntimeError(
+                f"Can't merge resource {self.name} when versions have incompatible schemas"
+            )
+        original_fields = {field.name for field in self.schema_.fields}
+        other_fields = {field.name for field in other.schema_.fields}
+
+        if missing_fields := original_fields - other_fields:
+            logger.warning(
+                f"The following fields were removed from table {self.name} "
+                f"in taxonomy version {other_version}: {missing_fields}"
+            )
+
+        fields = self.schema_.fields
+        if new_fields := other_fields - original_fields:
+            logger.warning(
+                f"The following fields were added to table {self.name} "
+                f"in taxonomy version {other_version}: {new_fields}"
+            )
+            # Add new fields to schema
+            fields += [
+                field for field in other.schema_.fields if field.name in new_fields
+            ]
+        # Return resource with updated schema
+        return self.model_copy(
+            update={
+                "schema": Schema(primary_key=self.schema_.primary_key, fields=fields)
+            }
+        )


class FactTable:
    """Class to handle constructing a dataframe from an XBRL fact table.
@@ -355,7 +398,6 @@ def __init__(self, schema: Schema, period_type: str):
            if field.name not in schema.primary_key
        ]
        self.instant = period_type == "instant"
-        self.logger = get_logger(__name__)

    def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
        """Construct dataframe from a parsed XBRL instance.
@@ -413,24 +455,60 @@ class Datapackage(BaseModel):
    resources: list[Resource]

    @classmethod
-    def from_taxonomy(
-        cls, taxonomy: Taxonomy, db_uri: str, form_number: int = 1
+    def from_taxonomies(
+        cls, taxonomies: dict[str, Taxonomy], db_uri: str, form_number: int = 1
    ) -> "Datapackage":
-        """Construct a Datapackage from an XBRL Taxonomy.
+        """Construct a Datapackage from parsed XBRL taxonomies.
+
+        FERC regularly releases new versions of their XBRL taxonomies, meaning
+        data from different years conforms to slightly different structures. This
+        method will attempt to merge these taxonomy versions into a single unified
+        schema defined in a Datapackage descriptor.
+
+        The exact logic for merging taxonomies is as follows. First, the oldest
+        available taxonomy is used to construct a baseline datapackage descriptor.
+        Next, it will parse subsequent versions and compare the set of tables
+        found with the baseline. New tables will be added to the schema, removed
+        tables will simply be logged but remain in the schema, and tables present
+        in both versions will get a deeper column-level comparison. For more info
+        on the table comparison, see ``Resource.merge_resources``.
+
        Args:
-            taxonomy: XBRL taxonomy which defines the structure of the database.
+            taxonomies: Dictionary mapping version strings to taxonomies to merge
+                into a Datapackage.
            db_uri: Path to database required for a Frictionless resource.
            form_number: FERC form number used for datapackage name.
        """
-        resources = []
-        for role in taxonomy.roles:
-            for period_type in ["duration", "instant"]:
-                resource = Resource.from_link_role(role, period_type, db_uri)
-                if resource:
-                    resources.append(resource)
+        resources = {}
+        logger.info("Attempting to merge taxonomies into a single datapackage.")
+        # Iterate through taxonomies in order of release and attempt to merge
+        for i, (taxonomy_version, taxonomy) in enumerate(sorted(taxonomies.items())):
+            baseline_resources = set(resources.keys())
+            new_resources = set()
+            for role in taxonomy.roles:
+                for period_type in ["duration", "instant"]:
+                    if resource := Resource.from_link_role(role, period_type, db_uri):
+                        new_resources.add(resource.name)
+                        if resource.name not in resources:
+                            # All resources will be new when parsing first taxonomy
+                            if i > 0:
+                                logger.warning(
+                                    f"Resource {resource.name} is new in {taxonomy_version}"
+                                )
+                            # Add new table to schema
+                            resources[resource.name] = resource
+                        else:
+                            # Merge tables in both versions of taxonomy
+                            resources[resource.name] = resources[
+                                resource.name
+                            ].merge_resources(resource, taxonomy_version)
+            if missing_resources := baseline_resources - new_resources:
+                logger.warning(
+                    f"The following resources were removed in {taxonomy_version}: {missing_resources}"
                )

-        return cls(resources=resources, name=f"ferc{form_number}-extracted-xbrl")
+        return cls(
+            resources=list(resources.values()), name=f"ferc{form_number}-extracted-xbrl"
+        )

    def get_fact_tables(
        self, filter_tables: set[str] | None = None
@@ -439,7 +517,7 @@
        Args:
            filter_tables: Optionally specify the set of tables to extract.
-            If None, all possible tables will be extracted.
+                If None, all possible tables will be extracted.
        """
        if filter_tables:
            filtered_resources = (r for r in self.resources if r.name in filter_tables)
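To make the merge rules above concrete, here is a minimal self-contained sketch of the column-level logic that `Resource.merge_resources` applies. `SimpleSchema` and `merge_schemas` are illustrative stand-ins, not the project's pydantic models, and the field names are made up:

```python
from dataclasses import dataclass


@dataclass
class SimpleSchema:
    """Stripped-down stand-in for the Frictionless table schema."""

    primary_key: list[str]
    fields: list[str]  # field names only, for illustration


def merge_schemas(base: SimpleSchema, other: SimpleSchema) -> SimpleSchema:
    """Merge two versions of a table schema following the rules above."""
    if base.primary_key != other.primary_key:
        # Mirrors the RuntimeError raised by merge_resources.
        raise RuntimeError("Incompatible primary keys; refusing to merge")
    # Fields missing from the newer version are kept (only logged in the real
    # code); fields new in the newer version are appended to the schema.
    new_fields = [f for f in other.fields if f not in base.fields]
    return SimpleSchema(base.primary_key, base.fields + new_fields)


v2022 = SimpleSchema(["entity_id", "date"], ["revenue", "expenses"])
v2023 = SimpleSchema(["entity_id", "date"], ["revenue", "fuel_costs"])
merged = merge_schemas(v2022, v2023)
# "expenses" survives even though the newer version dropped it,
# and "fuel_costs" is appended as a new column.
assert merged.fields == ["revenue", "expenses", "fuel_costs"]
```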
4 changes: 2 additions & 2 deletions src/ferc_xbrl_extractor/taxonomy.py
@@ -262,7 +262,7 @@ def from_source(
        return cls(roles=roles)


-def get_metadata_from_taxonomies(taxonomies: list[Taxonomy]) -> dict:
+def get_metadata_from_taxonomies(taxonomies: dict[str, Taxonomy]) -> dict:
    """Get dictionary of taxonomy metadata.

    XBRL taxonomies contain metadata that can be useful for interpreting reported
@@ -273,7 +273,7 @@ def get_metadata_from_taxonomies(taxonomies: list[Taxonomy]) -> dict:

    duration_metadata = {}
    instant_metadata = {}
-    for taxonomy in taxonomies:
+    for taxonomy in taxonomies.values():
        # Get metadata for duration tables
        duration_metadata.update(
            {
40 changes: 17 additions & 23 deletions src/ferc_xbrl_extractor/xbrl.py
@@ -83,7 +83,7 @@ def extract(

def table_data_from_instances(
    instance_builders: list[InstanceBuilder],
-    table_defs: dict[str, dict[str, FactTable]],
+    table_defs: dict[str, FactTable],
    batch_size: int | None = None,
    workers: int | None = None,
) -> tuple[dict[str, pd.DataFrame], dict[str, list]]:
@@ -200,7 +200,7 @@ def process_instance(
logger.info(f"Extracting {instance.filing_name}")

dfs = {}
for key, table_def in table_defs[instance.taxonomy_version].items():
for key, table_def in table_defs.items():
dfs[key] = table_def.construct_dataframe(instance)

return dfs
@@ -213,7 +213,7 @@ def get_fact_tables(
    filter_tables: set[str] | None = None,
    datapackage_path: str | None = None,
    metadata_path: str | None = None,
-) -> dict[str, dict[str, FactTable]]:
+) -> dict[str, FactTable]:
    """Parse taxonomy from URL.

    XBRL defines 'fact tables' that group related facts. These fact
@@ -238,7 +238,7 @@
    Returns:
        Dictionary mapping table names to structure.
    """
-    taxonomies = []
+    taxonomies = {}
    fact_tables = {}
    metadata = {}
    with ZipFile(taxonomy_source, "r") as taxonomy_archive:
@@ -252,30 +252,24 @@

            taxonomy_entry_point = f"taxonomy/form{form_number}/{taxonomy_date}/form/form{form_number}/form-{form_number}_{taxonomy_date}.xsd"
            taxonomy = Taxonomy.from_source(f, entry_point=taxonomy_entry_point)
-            taxonomies.append(taxonomy)
+            taxonomies[taxonomy_version] = taxonomy

-            datapackage = Datapackage.from_taxonomy(
-                taxonomy, db_uri, form_number=form_number
-            )
-
-            if datapackage_path:
-                # Verify that datapackage descriptor is valid before outputting
-                report = Package.validate_descriptor(
-                    datapackage.model_dump(by_alias=True)
-                )
-
-                if not report.valid:
-                    raise RuntimeError(
-                        f"Generated datapackage is invalid - {report.errors}"
-                    )
-
-                # Write to JSON file
-                with Path(datapackage_path).open(mode="w") as f:
-                    f.write(datapackage.model_dump_json(by_alias=True))
-
-            fact_tables[taxonomy_version] = datapackage.get_fact_tables(
-                filter_tables=filter_tables
-            )
+    datapackage = Datapackage.from_taxonomies(
+        taxonomies, db_uri, form_number=form_number
+    )
+
+    if datapackage_path:
+        # Verify that datapackage descriptor is valid before outputting
+        report = Package.validate_descriptor(datapackage.model_dump(by_alias=True))
+
+        if not report.valid:
+            raise RuntimeError(f"Generated datapackage is invalid - {report.errors}")
+
+        # Write to JSON file
+        with Path(datapackage_path).open(mode="w") as f:
+            f.write(datapackage.model_dump_json(by_alias=True, indent=2))
+
+    fact_tables = datapackage.get_fact_tables(filter_tables=filter_tables)

    # Save taxonomy metadata
    metadata = get_metadata_from_taxonomies(taxonomies)
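In miniature, the shape change that drives this file's edits (the table and version names below are hypothetical): `get_fact_tables` previously returned one table mapping per taxonomy version, and now returns a single merged mapping, which is why `process_instance` no longer looks tables up via `instance.taxonomy_version`.

```python
# Before this PR: FactTables nested per taxonomy version, so each filing
# had to be matched to the right version before extraction.
table_defs_before = {
    "form-1-2022-01-01.zip": {"identification_001_duration": ...},
    "form-1-2023-11-01.zip": {"identification_001_duration": ...},
}

# After this PR: one merged mapping covers filings from every version.
table_defs_after = {"identification_001_duration": ...}
```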
2 changes: 1 addition & 1 deletion tests/integration/console_scripts_test.py
@@ -41,7 +41,7 @@ def test_extract_example_filings(script_runner, tmp_path, test_dir):
        [
            "xbrl_extract",
            str(data_dir / "ferc1-xbrl-2021.zip"),
-            "--db_path",
+            "--db-path",
            str(out_db),
            "--taxonomy",
            str(data_dir / "ferc1-xbrl-taxonomies.zip"),
Binary file modified tests/integration/data/ferc1-xbrl-taxonomies.zip
18 changes: 4 additions & 14 deletions tests/integration/data_quality_test.py
@@ -35,7 +35,7 @@ def extracted(metadata_dir, data_dir, request) -> ExtractOutput:


def test_lost_facts_pct(extracted, request):
-    table_defs_map, table_data, stats = extracted
+    table_defs, table_data, stats = extracted
    total_facts = sum(
        instance_stats["total_facts"] for instance_stats in stats.values()
    )
@@ -65,16 +65,8 @@
    assert instance_used_ratio > per_filing_threshold and instance_used_ratio <= 1


-def _get_relevant_table_defs(table_defs_map: dict):
-    # Note: this just grabs table_defs from a random version of the taxonomy.
-    # The taxonomy versions are close enough that this works for now, but this
-    # could break tests in the future.
-    return list(table_defs_map.values())[0]
-
-
def test_publication_time(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

    for table_name, table in table_defs.items():
        assert (
@@ -86,8 +78,7 @@


def test_all_data_has_corresponding_id(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

    [id_table_name] = [
        name
@@ -109,8 +100,7 @@


def test_null_values(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

    for table_name, table in table_defs.items():
        dataframe = table_data[table_name]
45 changes: 28 additions & 17 deletions tests/integration/datapackage_test.py
@@ -23,28 +23,39 @@

def test_datapackage_generation(test_dir, data_dir):
    """Test that datapackage descriptor is valid."""
-    with (
-        zipfile.ZipFile(data_dir / "ferc1-xbrl-taxonomies.zip") as archive,
-        archive.open("form-1-2022-01-01.zip", mode="r") as f,
-    ):
-        taxonomy = Taxonomy.from_source(
-            f,
-            entry_point=Path(
-                "taxonomy/form1/2022-01-01/form/form1/form-1_2022-01-01.xsd"
-            ),
-        )
-    datapackage = Datapackage.from_taxonomy(taxonomy, "sqlite:///test_db.sqlite")
-
-    filtered_tables = datapackage.get_fact_tables(
-        filter_tables={"identification_001_duration"}
-    )
-    assert set(filtered_tables.keys()) == {"identification_001_duration"}
+    taxonomies = {}
+    for version, entry_point in [
+        (
+            "form-1-2022-01-01.zip",
+            "taxonomy/form1/2022-01-01/form/form1/form-1_2022-01-01.xsd",
+        ),
+        (
+            "form-1-2023-11-01.zip",
+            "taxonomy/form1/2023-11-01/form/form1/form-1_2023-11-01.xsd",
+        ),
+    ]:
+        with (
+            zipfile.ZipFile(data_dir / "ferc1-xbrl-taxonomies.zip") as archive,
+            archive.open(version, mode="r") as f,
+        ):
+            taxonomies[version] = Taxonomy.from_source(
+                f,
+                entry_point=Path(entry_point),
+            )
+    datapackage = Datapackage.from_taxonomies(taxonomies, "sqlite:///test_db.sqlite")
+
+    filter_tables = {
+        "identification_001_duration",
+        "energy_storage_operations_small_plants_419_duration",
+    }
+    filtered_tables = datapackage.get_fact_tables(filter_tables=filter_tables)
+    assert set(filtered_tables.keys()) == filter_tables

    all_tables = datapackage.get_fact_tables()

    # 366 was just the value we had - this assertion is more of a regression
    # test than a normative statement
-    assert len(all_tables) == 366
+    assert len(all_tables) == 370

    assert Package.validate_descriptor(datapackage.model_dump(by_alias=True))

