diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py index bc0a6141..84594430 100644 --- a/malariagen_data/af1.py +++ b/malariagen_data/af1.py @@ -89,6 +89,8 @@ def __init__( discordant_read_calls_analysis=None, pre=False, tqdm_class=None, + unrestricted_use_only=False, + surveillance_use_only=False, **storage_options, # used by fsspec via init_filesystem() ): super().__init__( @@ -124,6 +126,8 @@ def __init__( virtual_contigs=None, gene_names=None, inversion_tag_path=None, + unrestricted_use_only=unrestricted_use_only, + surveillance_use_only=surveillance_use_only, ) def __repr__(self): @@ -136,6 +140,8 @@ def __repr__(self): f"Site filters analysis : {self._site_filters_analysis}\n" f"Software version : malariagen_data {malariagen_data.__version__}\n" f"Client location : {self.client_location}\n" + f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n" + f"Data filtered to surveillance use only: {self._surveillance_use_only}\n" f"---\n" f"Please note that data are subject to terms of use,\n" f"for more information see https://www.malariagen.net/data\n" @@ -201,6 +207,18 @@ def _repr_html_(self): {self.client_location} + + + Data filtered for unrestricted use only + + {self._unrestricted_use_only} + + + + Data filtered for surveillance use only + + {self._surveillance_use_only} + """ diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index 443c594b..3dfdb4f9 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -150,6 +150,8 @@ def __init__( discordant_read_calls_analysis=None, pre=False, tqdm_class=None, + unrestricted_use_only=False, + surveillance_use_only=False, **storage_options, # used by fsspec via init_filesystem() ): super().__init__( @@ -192,6 +194,8 @@ def __init__( virtual_contigs=VIRTUAL_CONTIGS, gene_names=GENE_NAMES, inversion_tag_path=INVERSION_TAG_PATH, + unrestricted_use_only=unrestricted_use_only, + surveillance_use_only=surveillance_use_only, ) # set up caches @@ -218,6 +222,8 @@ def __repr__(self): f"Site filters analysis : {self._site_filters_analysis}\n" f"Software version : malariagen_data {malariagen_data.__version__}\n" f"Client location : {self.client_location}\n" + f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n" + f"Data filtered to surveillance use only: {self._surveillance_use_only}\n" f"---\n" f"Please note that data are subject to terms of use,\n" f"for more information see https://www.malariagen.net/data\n" @@ -289,6 +295,18 @@ def _repr_html_(self): {self.client_location} + + + Data filtered for unrestricted use only + + {self._unrestricted_use_only} + + + + Data filtered for surveillance use only + + {self._surveillance_use_only} + """ diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py index a214e6fb..3cbaf75b 100644 --- a/malariagen_data/anoph/base.py +++ b/malariagen_data/anoph/base.py @@ -61,6 +61,8 @@ def __init__( storage_options: Optional[Mapping] = None, results_cache: Optional[str] = None, tqdm_class=None, + unrestricted_use_only: Optional[bool] = False, + surveillance_use_only: Optional[bool] = False, ): # If show_progress has not been specified, then determine the default. if show_progress is None: @@ -85,6 +87,8 @@ def __init__( if tqdm_class is None: tqdm_class = tqdm_auto self._tqdm_class = tqdm_class + self._unrestricted_use_only = unrestricted_use_only + self._surveillance_use_only = surveillance_use_only # Set up logging. self._log = LoggingHelper(name=__name__, out=log, debug=debug) @@ -406,6 +410,7 @@ def _read_sample_sets(self, *, single_release: str): `terms_of_use_url` is the URL of the terms of use, `release` is the identifier of the release containing the sample set, `unrestricted_use` whether the sample set can be without restriction (e.g., if the terms of use have expired). + If `unrestricted_use_only` is set to `True` then only sample sets with `unrestricted_use` set to `True` will be included. """, ) def sample_sets( @@ -428,6 +433,11 @@ def sample_sets( except KeyError: # Read and cache dataframe for performance. df = self._read_sample_sets(single_release=release) + + # If unrestricted_use_only, restrict to sample sets with unrestricted_use. + if "unrestricted_use" in df.columns and self._unrestricted_use_only: + df = df[df["unrestricted_use"].astype(bool)] + self._cache_sample_sets[release] = df elif isinstance(release, Sequence): diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 4a2f63df..07856e62 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -141,6 +141,8 @@ def __init__( virtual_contigs: Optional[Mapping[str, Sequence[str]]], gene_names: Optional[Mapping[str, str]], inversion_tag_path: Optional[str], + unrestricted_use_only: Optional[bool], + surveillance_use_only: Optional[bool], ): super().__init__( url=url, @@ -175,6 +177,8 @@ def __init__( virtual_contigs=virtual_contigs, gene_names=gene_names, inversion_tag_path=inversion_tag_path, + unrestricted_use_only=unrestricted_use_only, + surveillance_use_only=surveillance_use_only, ) @property diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index e5b8ec8e..46e84a8b 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -7,7 +7,7 @@ import plotly.graph_objects as go # type: ignore import pytest from pandas.testing import assert_frame_equal -from pytest_cases import parametrize_with_cases +from pytest_cases import parametrize_with_cases, case from typeguard import suppress_type_checks from malariagen_data import af1 as _af1 @@ -36,6 +36,73 @@ def ag3_sim_api(ag3_sim_fixture): ) +@pytest.fixture +def ag3_sim_unrestricted_use_only_api(ag3_sim_fixture): + return AnophelesSampleMetadata( + url=ag3_sim_fixture.url, + config_path=_ag3.CONFIG_PATH, + major_version_number=_ag3.MAJOR_VERSION_NUMBER, + major_version_path=_ag3.MAJOR_VERSION_PATH, + pre=True, + aim_metadata_dtype={ + "aim_species_fraction_arab": "float64", + "aim_species_fraction_colu": "float64", + "aim_species_fraction_colu_no2l": "float64", + "aim_species_gambcolu_arabiensis": object, + "aim_species_gambiae_coluzzii": object, + "aim_species": object, + }, + taxon_colors=_ag3.TAXON_COLORS, + cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"], + unrestricted_use_only=True, + ) + + +@pytest.fixture +def ag3_sim_surveillance_use_only_api(ag3_sim_fixture): + return AnophelesSampleMetadata( + url=ag3_sim_fixture.url, + config_path=_ag3.CONFIG_PATH, + major_version_number=_ag3.MAJOR_VERSION_NUMBER, + major_version_path=_ag3.MAJOR_VERSION_PATH, + pre=True, + aim_metadata_dtype={ + "aim_species_fraction_arab": "float64", + "aim_species_fraction_colu": "float64", + "aim_species_fraction_colu_no2l": "float64", + "aim_species_gambcolu_arabiensis": object, + "aim_species_gambiae_coluzzii": object, + "aim_species": object, + }, + taxon_colors=_ag3.TAXON_COLORS, + cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"], + surveillance_use_only=True, + ) + + +@pytest.fixture +def ag3_sim_unrestricted_surveillance_use_only_api(ag3_sim_fixture): + return AnophelesSampleMetadata( + url=ag3_sim_fixture.url, + config_path=_ag3.CONFIG_PATH, + major_version_number=_ag3.MAJOR_VERSION_NUMBER, + major_version_path=_ag3.MAJOR_VERSION_PATH, + pre=True, + aim_metadata_dtype={ + "aim_species_fraction_arab": "float64", + "aim_species_fraction_colu": "float64", + "aim_species_fraction_colu_no2l": "float64", + "aim_species_gambcolu_arabiensis": object, + "aim_species_gambiae_coluzzii": object, + "aim_species": object, + }, + taxon_colors=_ag3.TAXON_COLORS, + cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"], + unrestricted_use_only=True, + surveillance_use_only=True, + ) + + @pytest.fixture def af1_sim_api(af1_sim_fixture): return AnophelesSampleMetadata( @@ -48,6 +115,46 @@ def af1_sim_api(af1_sim_fixture): ) +@pytest.fixture +def af1_sim_unrestricted_use_only_api(af1_sim_fixture): + return AnophelesSampleMetadata( + url=af1_sim_fixture.url, + config_path=_af1.CONFIG_PATH, + major_version_number=_af1.MAJOR_VERSION_NUMBER, + major_version_path=_af1.MAJOR_VERSION_PATH, + pre=False, + taxon_colors=_af1.TAXON_COLORS, + unrestricted_use_only=True, + ) + + +@pytest.fixture +def af1_sim_surveillance_use_only_api(af1_sim_fixture): + return AnophelesSampleMetadata( + url=af1_sim_fixture.url, + config_path=_af1.CONFIG_PATH, + major_version_number=_af1.MAJOR_VERSION_NUMBER, + major_version_path=_af1.MAJOR_VERSION_PATH, + pre=False, + taxon_colors=_af1.TAXON_COLORS, + surveillance_use_only=True, + ) + + +@pytest.fixture +def af1_sim_unrestricted_surveillance_use_only_api(af1_sim_fixture): + return AnophelesSampleMetadata( + url=af1_sim_fixture.url, + config_path=_af1.CONFIG_PATH, + major_version_number=_af1.MAJOR_VERSION_NUMBER, + major_version_path=_af1.MAJOR_VERSION_PATH, + pre=False, + taxon_colors=_af1.TAXON_COLORS, + unrestricted_use_only=True, + surveillance_use_only=True, + ) + + @pytest.fixture def missing_metadata_api(fixture_dir): # In this fixture, one of the sample sets (AG1000G-BF-A) has missing files @@ -69,14 +176,58 @@ def missing_metadata_api(fixture_dir): ) +@case def case_ag3_sim(ag3_sim_fixture, ag3_sim_api): return ag3_sim_fixture, ag3_sim_api +@case def case_af1_sim(af1_sim_fixture, af1_sim_api): return af1_sim_fixture, af1_sim_api +@case +def case_ag3_sim_unrestricted_use_only( + ag3_sim_fixture, ag3_sim_unrestricted_use_only_api +): + return ag3_sim_fixture, ag3_sim_unrestricted_use_only_api + + +@case +def case_af1_sim_unrestricted_use_only( + af1_sim_fixture, af1_sim_unrestricted_use_only_api +): + return af1_sim_fixture, af1_sim_unrestricted_use_only_api + + +@case +def case_ag3_sim_surveillance_use_only( + ag3_sim_fixture, ag3_sim_surveillance_use_only_api +): + return ag3_sim_fixture, ag3_sim_surveillance_use_only_api + + +@case +def case_af1_sim_surveillance_use_only( + af1_sim_fixture, af1_sim_surveillance_use_only_api +): + return af1_sim_fixture, af1_sim_surveillance_use_only_api + + +@case +def case_ag3_sim_unrestricted_surveillance_use_only( + ag3_sim_fixture, ag3_sim_unrestricted_surveillance_use_only_api +): + return ag3_sim_fixture, ag3_sim_unrestricted_surveillance_use_only_api + + +@case +def case_af1_sim_unrestricted_surveillance_use_only( + af1_sim_fixture, af1_sim_unrestricted_surveillance_use_only_api +): + return af1_sim_fixture, af1_sim_unrestricted_surveillance_use_only_api + + def general_metadata_expected_columns(): return { "sample_id": "O", @@ -117,6 +268,11 @@ def test_general_metadata_with_single_sample_set(fixture, api: AnophelesSampleMe df_sample_sets = api.sample_sets().set_index("sample_set") sample_count = df_sample_sets["sample_count"] all_sample_sets = df_sample_sets.index.to_list() + + # FIXME: we should probably add more sample sets to the fixtures to test combinations of unrestricted_use_only and surveillance_use_only. + if len(all_sample_sets) == 0: + pytest.skip("Skipping because there are no relevant sample sets to test.") + sample_set = random.choice(all_sample_sets) # Call function to be tested.