diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
index bc0a6141..84594430 100644
--- a/malariagen_data/af1.py
+++ b/malariagen_data/af1.py
@@ -89,6 +89,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
+ unrestricted_use_only=False,
+ surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
@@ -124,6 +126,8 @@ def __init__(
virtual_contigs=None,
gene_names=None,
inversion_tag_path=None,
+ unrestricted_use_only=unrestricted_use_only,
+ surveillance_use_only=surveillance_use_only,
)
def __repr__(self):
@@ -136,6 +140,8 @@ def __repr__(self):
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
+ f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
+ f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
@@ -201,6 +207,18 @@ def _repr_html_(self):
{self.client_location} |
+
+
+ Data filtered for unrestricted use only
+ |
+ {self._unrestricted_use_only} |
+
+
+
+ Data filtered for surveillance use only
+ |
+ {self._surveillance_use_only} |
+
"""
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index 443c594b..3dfdb4f9 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -150,6 +150,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
+ unrestricted_use_only=False,
+ surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
@@ -192,6 +194,8 @@ def __init__(
virtual_contigs=VIRTUAL_CONTIGS,
gene_names=GENE_NAMES,
inversion_tag_path=INVERSION_TAG_PATH,
+ unrestricted_use_only=unrestricted_use_only,
+ surveillance_use_only=surveillance_use_only,
)
# set up caches
@@ -218,6 +222,8 @@ def __repr__(self):
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
+ f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
+ f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
@@ -289,6 +295,18 @@ def _repr_html_(self):
{self.client_location} |
+
+
+ Data filtered for unrestricted use only
+ |
+ {self._unrestricted_use_only} |
+
+
+
+ Data filtered for surveillance use only
+ |
+ {self._surveillance_use_only} |
+
"""
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index a214e6fb..3cbaf75b 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -61,6 +61,8 @@ def __init__(
storage_options: Optional[Mapping] = None,
results_cache: Optional[str] = None,
tqdm_class=None,
+ unrestricted_use_only: Optional[bool] = False,
+ surveillance_use_only: Optional[bool] = False,
):
# If show_progress has not been specified, then determine the default.
if show_progress is None:
@@ -85,6 +87,8 @@ def __init__(
if tqdm_class is None:
tqdm_class = tqdm_auto
self._tqdm_class = tqdm_class
+ self._unrestricted_use_only = unrestricted_use_only
+ self._surveillance_use_only = surveillance_use_only
# Set up logging.
self._log = LoggingHelper(name=__name__, out=log, debug=debug)
@@ -406,6 +410,7 @@ def _read_sample_sets(self, *, single_release: str):
`terms_of_use_url` is the URL of the terms of use,
`release` is the identifier of the release containing the sample set,
`unrestricted_use` whether the sample set can be without restriction (e.g., if the terms of use have expired).
+ If `unrestricted_use_only` is set to `True` then only sample sets with `unrestricted_use` set to `True` will be included.
""",
)
def sample_sets(
@@ -428,6 +433,11 @@ def sample_sets(
except KeyError:
# Read and cache dataframe for performance.
df = self._read_sample_sets(single_release=release)
+
+ # If unrestricted_use_only, restrict to sample sets with unrestricted_use.
+ if "unrestricted_use" in df.columns and self._unrestricted_use_only:
+ df = df[df["unrestricted_use"].astype(bool)]
+
self._cache_sample_sets[release] = df
elif isinstance(release, Sequence):
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 4a2f63df..07856e62 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -141,6 +141,8 @@ def __init__(
virtual_contigs: Optional[Mapping[str, Sequence[str]]],
gene_names: Optional[Mapping[str, str]],
inversion_tag_path: Optional[str],
+ unrestricted_use_only: Optional[bool],
+ surveillance_use_only: Optional[bool],
):
super().__init__(
url=url,
@@ -175,6 +177,8 @@ def __init__(
virtual_contigs=virtual_contigs,
gene_names=gene_names,
inversion_tag_path=inversion_tag_path,
+ unrestricted_use_only=unrestricted_use_only,
+ surveillance_use_only=surveillance_use_only,
)
@property
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
index e5b8ec8e..46e84a8b 100644
--- a/tests/anoph/test_sample_metadata.py
+++ b/tests/anoph/test_sample_metadata.py
@@ -7,7 +7,7 @@
import plotly.graph_objects as go # type: ignore
import pytest
from pandas.testing import assert_frame_equal
-from pytest_cases import parametrize_with_cases
+from pytest_cases import parametrize_with_cases, case
from typeguard import suppress_type_checks
from malariagen_data import af1 as _af1
@@ -36,6 +36,73 @@ def ag3_sim_api(ag3_sim_fixture):
)
+def _ag3_sim_sample_metadata_api(sim_fixture, **use_flags):
+    """Build an `AnophelesSampleMetadata` client for the simulated Ag3 data.
+
+    Shared by the three use-restriction fixtures in this group so that the
+    Ag3 configuration is written once; those fixtures differ only in the
+    `*_use_only` flags they pass.
+
+    Parameters
+    ----------
+    sim_fixture
+        The `ag3_sim_fixture` object; its `url` and `config` are read here.
+    **use_flags
+        Extra keyword arguments forwarded unchanged to
+        `AnophelesSampleMetadata`, e.g. `unrestricted_use_only=True` and/or
+        `surveillance_use_only=True`.
+
+    Returns
+    -------
+    AnophelesSampleMetadata
+        A client configured against the simulated Ag3 data.
+    """
+    return AnophelesSampleMetadata(
+        url=sim_fixture.url,
+        config_path=_ag3.CONFIG_PATH,
+        major_version_number=_ag3.MAJOR_VERSION_NUMBER,
+        major_version_path=_ag3.MAJOR_VERSION_PATH,
+        pre=True,
+        aim_metadata_dtype={
+            "aim_species_fraction_arab": "float64",
+            "aim_species_fraction_colu": "float64",
+            "aim_species_fraction_colu_no2l": "float64",
+            "aim_species_gambcolu_arabiensis": object,
+            "aim_species_gambiae_coluzzii": object,
+            "aim_species": object,
+        },
+        taxon_colors=_ag3.TAXON_COLORS,
+        cohorts_analysis=sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"],
+        **use_flags,
+    )
+
+
+@pytest.fixture
+def ag3_sim_unrestricted_use_only_api(ag3_sim_fixture):
+    """Simulated Ag3 client constructed with `unrestricted_use_only=True`."""
+    return _ag3_sim_sample_metadata_api(
+        ag3_sim_fixture,
+        unrestricted_use_only=True,
+    )
+
+
+@pytest.fixture
+def ag3_sim_surveillance_use_only_api(ag3_sim_fixture):
+    """Simulated Ag3 client constructed with `surveillance_use_only=True`."""
+    return _ag3_sim_sample_metadata_api(
+        ag3_sim_fixture,
+        surveillance_use_only=True,
+    )
+
+
+@pytest.fixture
+def ag3_sim_unrestricted_surveillance_use_only_api(ag3_sim_fixture):
+    """Simulated Ag3 client constructed with both `*_use_only` flags set."""
+    return _ag3_sim_sample_metadata_api(
+        ag3_sim_fixture,
+        unrestricted_use_only=True,
+        surveillance_use_only=True,
+    )
+
+
@pytest.fixture
def af1_sim_api(af1_sim_fixture):
return AnophelesSampleMetadata(
@@ -48,6 +115,46 @@ def af1_sim_api(af1_sim_fixture):
)
+def _af1_sim_sample_metadata_api(sim_fixture, **use_flags):
+    """Build an `AnophelesSampleMetadata` client for the simulated Af1 data.
+
+    Shared by the three use-restriction fixtures in this group so that the
+    Af1 configuration is written once; those fixtures differ only in the
+    `*_use_only` flags they pass.
+
+    Parameters
+    ----------
+    sim_fixture
+        The `af1_sim_fixture` object; only its `url` is read here.
+    **use_flags
+        Extra keyword arguments forwarded unchanged to
+        `AnophelesSampleMetadata`, e.g. `unrestricted_use_only=True` and/or
+        `surveillance_use_only=True`.
+
+    Returns
+    -------
+    AnophelesSampleMetadata
+        A client configured against the simulated Af1 data.
+    """
+    return AnophelesSampleMetadata(
+        url=sim_fixture.url,
+        config_path=_af1.CONFIG_PATH,
+        major_version_number=_af1.MAJOR_VERSION_NUMBER,
+        major_version_path=_af1.MAJOR_VERSION_PATH,
+        pre=False,
+        taxon_colors=_af1.TAXON_COLORS,
+        **use_flags,
+    )
+
+
+@pytest.fixture
+def af1_sim_unrestricted_use_only_api(af1_sim_fixture):
+    """Simulated Af1 client constructed with `unrestricted_use_only=True`."""
+    return _af1_sim_sample_metadata_api(
+        af1_sim_fixture,
+        unrestricted_use_only=True,
+    )
+
+
+@pytest.fixture
+def af1_sim_surveillance_use_only_api(af1_sim_fixture):
+    """Simulated Af1 client constructed with `surveillance_use_only=True`."""
+    return _af1_sim_sample_metadata_api(
+        af1_sim_fixture,
+        surveillance_use_only=True,
+    )
+
+
+@pytest.fixture
+def af1_sim_unrestricted_surveillance_use_only_api(af1_sim_fixture):
+    """Simulated Af1 client constructed with both `*_use_only` flags set."""
+    return _af1_sim_sample_metadata_api(
+        af1_sim_fixture,
+        unrestricted_use_only=True,
+        surveillance_use_only=True,
+    )
+
+
@pytest.fixture
def missing_metadata_api(fixture_dir):
# In this fixture, one of the sample sets (AG1000G-BF-A) has missing files
@@ -69,14 +176,58 @@ def missing_metadata_api(fixture_dir):
)
+@case
def case_ag3_sim(ag3_sim_fixture, ag3_sim_api):
+    """Case: the simulated Ag3 data fixture paired with the `ag3_sim_api` client."""
return ag3_sim_fixture, ag3_sim_api
+@case
def case_af1_sim(af1_sim_fixture, af1_sim_api):
+    """Case: the simulated Af1 data fixture paired with the `af1_sim_api` client."""
return af1_sim_fixture, af1_sim_api
+@case
+def case_ag3_sim_unrestricted_use_only(
+    ag3_sim_fixture,
+    ag3_sim_unrestricted_use_only_api,
+):
+    """Case: simulated Ag3 data with the `unrestricted_use_only=True` client.
+
+    Returns the `(fixture, api)` pair consumed by the parametrized tests.
+    """
+    fixture_and_api = (ag3_sim_fixture, ag3_sim_unrestricted_use_only_api)
+    return fixture_and_api
+
+
+@case
+def case_af1_sim_unrestricted_use_only(
+    af1_sim_fixture,
+    af1_sim_unrestricted_use_only_api,
+):
+    """Case: simulated Af1 data with the `unrestricted_use_only=True` client.
+
+    Returns the `(fixture, api)` pair consumed by the parametrized tests.
+    """
+    fixture_and_api = (af1_sim_fixture, af1_sim_unrestricted_use_only_api)
+    return fixture_and_api
+
+
+@case
+def case_ag3_sim_surveillance_use_only(
+    ag3_sim_fixture,
+    ag3_sim_surveillance_use_only_api,
+):
+    """Case: simulated Ag3 data with the `surveillance_use_only=True` client.
+
+    Returns the `(fixture, api)` pair consumed by the parametrized tests.
+    """
+    fixture_and_api = (ag3_sim_fixture, ag3_sim_surveillance_use_only_api)
+    return fixture_and_api
+
+
+@case
+def case_af1_sim_surveillance_use_only(
+    af1_sim_fixture,
+    af1_sim_surveillance_use_only_api,
+):
+    """Case: simulated Af1 data with the `surveillance_use_only=True` client.
+
+    Returns the `(fixture, api)` pair consumed by the parametrized tests.
+    """
+    fixture_and_api = (af1_sim_fixture, af1_sim_surveillance_use_only_api)
+    return fixture_and_api
+
+
+@case
+def case_ag3_sim_unrestricted_surveillance_use_only(
+    ag3_sim_fixture,
+    ag3_sim_unrestricted_surveillance_use_only_api,
+):
+    """Case: simulated Ag3 data with both `*_use_only` flags set on the client.
+
+    Returns the `(fixture, api)` pair consumed by the parametrized tests.
+    """
+    fixture_and_api = (
+        ag3_sim_fixture,
+        ag3_sim_unrestricted_surveillance_use_only_api,
+    )
+    return fixture_and_api
+
+
+@case
+def case_af1_sim_unrestricted_surveillance_use_only(
+    af1_sim_fixture,
+    af1_sim_unrestricted_surveillance_use_only_api,
+):
+    """Case: simulated Af1 data with both `*_use_only` flags set on the client.
+
+    Returns the `(fixture, api)` pair consumed by the parametrized tests.
+    """
+    fixture_and_api = (
+        af1_sim_fixture,
+        af1_sim_unrestricted_surveillance_use_only_api,
+    )
+    return fixture_and_api
+
+
def general_metadata_expected_columns():
return {
"sample_id": "O",
@@ -117,6 +268,11 @@ def test_general_metadata_with_single_sample_set(fixture, api: AnophelesSampleMe
df_sample_sets = api.sample_sets().set_index("sample_set")
sample_count = df_sample_sets["sample_count"]
all_sample_sets = df_sample_sets.index.to_list()
+
+ # FIXME: we should probably add more sample sets to the fixtures to test combinations of unrestricted_use_only and surveillance_use_only.
+ if len(all_sample_sets) == 0:
+ pytest.skip("Skipping because there are no relevant sample sets to test.")
+
sample_set = random.choice(all_sample_sets)
# Call function to be tested.