Skip to content

Commit

Permalink
WIP: dev support for unrestricted_use_only, surveillance_use_only
Browse files Browse the repository at this point in the history
  • Loading branch information
leehart committed Feb 7, 2025
1 parent ff10e60 commit fc6c2ba
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 1 deletion.
18 changes: 18 additions & 0 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
unrestricted_use_only=False,
surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
Expand Down Expand Up @@ -124,6 +126,8 @@ def __init__(
virtual_contigs=None,
gene_names=None,
inversion_tag_path=None,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
)

def __repr__(self):
Expand All @@ -136,6 +140,8 @@ def __repr__(self):
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
Expand Down Expand Up @@ -201,6 +207,18 @@ def _repr_html_(self):
</th>
<td>{self.client_location}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for unrestricted use only
</th>
<td>{self._unrestricted_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for surveillance use only
</th>
<td>{self._surveillance_use_only}</td>
</tr>
</tbody>
</table>
"""
Expand Down
18 changes: 18 additions & 0 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
unrestricted_use_only=False,
surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
Expand Down Expand Up @@ -192,6 +194,8 @@ def __init__(
virtual_contigs=VIRTUAL_CONTIGS,
gene_names=GENE_NAMES,
inversion_tag_path=INVERSION_TAG_PATH,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
)

# set up caches
Expand All @@ -218,6 +222,8 @@ def __repr__(self):
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
Expand Down Expand Up @@ -289,6 +295,18 @@ def _repr_html_(self):
</th>
<td>{self.client_location}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for unrestricted use only
</th>
<td>{self._unrestricted_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for surveillance use only
</th>
<td>{self._surveillance_use_only}</td>
</tr>
</tbody>
</table>
"""
Expand Down
10 changes: 10 additions & 0 deletions malariagen_data/anoph/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def __init__(
storage_options: Optional[Mapping] = None,
results_cache: Optional[str] = None,
tqdm_class=None,
unrestricted_use_only: Optional[bool] = False,
surveillance_use_only: Optional[bool] = False,
):
# If show_progress has not been specified, then determine the default.
if show_progress is None:
Expand All @@ -85,6 +87,8 @@ def __init__(
if tqdm_class is None:
tqdm_class = tqdm_auto
self._tqdm_class = tqdm_class
self._unrestricted_use_only = unrestricted_use_only
self._surveillance_use_only = surveillance_use_only

# Set up logging.
self._log = LoggingHelper(name=__name__, out=log, debug=debug)
Expand Down Expand Up @@ -406,6 +410,7 @@ def _read_sample_sets(self, *, single_release: str):
`terms_of_use_url` is the URL of the terms of use,
`release` is the identifier of the release containing the sample set,
`unrestricted_use` indicates whether the sample set can be used without restriction (e.g., if the terms of use have expired).
If `unrestricted_use_only` is set to `True` then only sample sets with `unrestricted_use` set to `True` will be included.
""",
)
def sample_sets(
Expand All @@ -428,6 +433,11 @@ def sample_sets(
except KeyError:
# Read and cache dataframe for performance.
df = self._read_sample_sets(single_release=release)

# If unrestricted_use_only, restrict to sample sets with unrestricted_use.
if "unrestricted_use" in df.columns and self._unrestricted_use_only:
df = df[df["unrestricted_use"].astype(bool)]

self._cache_sample_sets[release] = df

elif isinstance(release, Sequence):
Expand Down
4 changes: 4 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ def __init__(
virtual_contigs: Optional[Mapping[str, Sequence[str]]],
gene_names: Optional[Mapping[str, str]],
inversion_tag_path: Optional[str],
unrestricted_use_only: Optional[bool],
surveillance_use_only: Optional[bool],
):
super().__init__(
url=url,
Expand Down Expand Up @@ -175,6 +177,8 @@ def __init__(
virtual_contigs=virtual_contigs,
gene_names=gene_names,
inversion_tag_path=inversion_tag_path,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
)

@property
Expand Down
158 changes: 157 additions & 1 deletion tests/anoph/test_sample_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import plotly.graph_objects as go # type: ignore
import pytest
from pandas.testing import assert_frame_equal
from pytest_cases import parametrize_with_cases
from pytest_cases import parametrize_with_cases, case
from typeguard import suppress_type_checks

from malariagen_data import af1 as _af1
Expand Down Expand Up @@ -36,6 +36,73 @@ def ag3_sim_api(ag3_sim_fixture):
)


@pytest.fixture
def ag3_sim_unrestricted_use_only_api(ag3_sim_fixture):
    """Return an ``AnophelesSampleMetadata`` client built from the Ag3
    fixture, constructed with ``unrestricted_use_only=True``.
    """
    # Collect the constructor keyword arguments first, in call order.
    settings = dict(
        url=ag3_sim_fixture.url,
        config_path=_ag3.CONFIG_PATH,
        major_version_number=_ag3.MAJOR_VERSION_NUMBER,
        major_version_path=_ag3.MAJOR_VERSION_PATH,
        pre=True,
        aim_metadata_dtype={
            "aim_species_fraction_arab": "float64",
            "aim_species_fraction_colu": "float64",
            "aim_species_fraction_colu_no2l": "float64",
            "aim_species_gambcolu_arabiensis": object,
            "aim_species_gambiae_coluzzii": object,
            "aim_species": object,
        },
        taxon_colors=_ag3.TAXON_COLORS,
        cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"],
        unrestricted_use_only=True,
    )
    return AnophelesSampleMetadata(**settings)


@pytest.fixture
def ag3_sim_surveillance_use_only_api(ag3_sim_fixture):
    """Return an ``AnophelesSampleMetadata`` client built from the Ag3
    fixture, constructed with ``surveillance_use_only=True``.
    """
    # Collect the constructor keyword arguments first, in call order.
    settings = dict(
        url=ag3_sim_fixture.url,
        config_path=_ag3.CONFIG_PATH,
        major_version_number=_ag3.MAJOR_VERSION_NUMBER,
        major_version_path=_ag3.MAJOR_VERSION_PATH,
        pre=True,
        aim_metadata_dtype={
            "aim_species_fraction_arab": "float64",
            "aim_species_fraction_colu": "float64",
            "aim_species_fraction_colu_no2l": "float64",
            "aim_species_gambcolu_arabiensis": object,
            "aim_species_gambiae_coluzzii": object,
            "aim_species": object,
        },
        taxon_colors=_ag3.TAXON_COLORS,
        cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"],
        surveillance_use_only=True,
    )
    return AnophelesSampleMetadata(**settings)


@pytest.fixture
def ag3_sim_unrestricted_surveillance_use_only_api(ag3_sim_fixture):
    """Return an ``AnophelesSampleMetadata`` client built from the Ag3
    fixture, constructed with both ``unrestricted_use_only=True`` and
    ``surveillance_use_only=True``.
    """
    # Collect the constructor keyword arguments first, in call order.
    settings = dict(
        url=ag3_sim_fixture.url,
        config_path=_ag3.CONFIG_PATH,
        major_version_number=_ag3.MAJOR_VERSION_NUMBER,
        major_version_path=_ag3.MAJOR_VERSION_PATH,
        pre=True,
        aim_metadata_dtype={
            "aim_species_fraction_arab": "float64",
            "aim_species_fraction_colu": "float64",
            "aim_species_fraction_colu_no2l": "float64",
            "aim_species_gambcolu_arabiensis": object,
            "aim_species_gambiae_coluzzii": object,
            "aim_species": object,
        },
        taxon_colors=_ag3.TAXON_COLORS,
        cohorts_analysis=ag3_sim_fixture.config["DEFAULT_COHORTS_ANALYSIS"],
        unrestricted_use_only=True,
        surveillance_use_only=True,
    )
    return AnophelesSampleMetadata(**settings)


@pytest.fixture
def af1_sim_api(af1_sim_fixture):
return AnophelesSampleMetadata(
Expand All @@ -48,6 +115,46 @@ def af1_sim_api(af1_sim_fixture):
)


@pytest.fixture
def af1_sim_unrestricted_use_only_api(af1_sim_fixture):
    """Return an ``AnophelesSampleMetadata`` client built from the Af1
    fixture, constructed with ``unrestricted_use_only=True``.
    """
    # Collect the constructor keyword arguments first, in call order.
    settings = dict(
        url=af1_sim_fixture.url,
        config_path=_af1.CONFIG_PATH,
        major_version_number=_af1.MAJOR_VERSION_NUMBER,
        major_version_path=_af1.MAJOR_VERSION_PATH,
        pre=False,
        taxon_colors=_af1.TAXON_COLORS,
        unrestricted_use_only=True,
    )
    return AnophelesSampleMetadata(**settings)


@pytest.fixture
def af1_sim_surveillance_use_only_api(af1_sim_fixture):
    """Return an ``AnophelesSampleMetadata`` client built from the Af1
    fixture, constructed with ``surveillance_use_only=True``.
    """
    # Collect the constructor keyword arguments first, in call order.
    settings = dict(
        url=af1_sim_fixture.url,
        config_path=_af1.CONFIG_PATH,
        major_version_number=_af1.MAJOR_VERSION_NUMBER,
        major_version_path=_af1.MAJOR_VERSION_PATH,
        pre=False,
        taxon_colors=_af1.TAXON_COLORS,
        surveillance_use_only=True,
    )
    return AnophelesSampleMetadata(**settings)


@pytest.fixture
def af1_sim_unrestricted_surveillance_use_only_api(af1_sim_fixture):
    """Return an ``AnophelesSampleMetadata`` client built from the Af1
    fixture, constructed with both ``unrestricted_use_only=True`` and
    ``surveillance_use_only=True``.
    """
    # Collect the constructor keyword arguments first, in call order.
    settings = dict(
        url=af1_sim_fixture.url,
        config_path=_af1.CONFIG_PATH,
        major_version_number=_af1.MAJOR_VERSION_NUMBER,
        major_version_path=_af1.MAJOR_VERSION_PATH,
        pre=False,
        taxon_colors=_af1.TAXON_COLORS,
        unrestricted_use_only=True,
        surveillance_use_only=True,
    )
    return AnophelesSampleMetadata(**settings)


@pytest.fixture
def missing_metadata_api(fixture_dir):
# In this fixture, one of the sample sets (AG1000G-BF-A) has missing files
Expand All @@ -69,14 +176,58 @@ def missing_metadata_api(fixture_dir):
)


@case
def case_ag3_sim(ag3_sim_fixture, ag3_sim_api):
    """Test case: the Ag3 fixture paired with ``ag3_sim_api``."""
    fixture_and_api = (ag3_sim_fixture, ag3_sim_api)
    return fixture_and_api


@case
def case_af1_sim(af1_sim_fixture, af1_sim_api):
    """Test case: the Af1 fixture paired with ``af1_sim_api``."""
    fixture_and_api = (af1_sim_fixture, af1_sim_api)
    return fixture_and_api


@case
def case_ag3_sim_unrestricted_use_only(
    ag3_sim_fixture, ag3_sim_unrestricted_use_only_api
):
    """Test case: the Ag3 fixture paired with the API client created with
    ``unrestricted_use_only=True``.
    """
    fixture_and_api = (ag3_sim_fixture, ag3_sim_unrestricted_use_only_api)
    return fixture_and_api


@case
def case_af1_sim_unrestricted_use_only(
    af1_sim_fixture, af1_sim_unrestricted_use_only_api
):
    """Test case: the Af1 fixture paired with the API client created with
    ``unrestricted_use_only=True``.
    """
    fixture_and_api = (af1_sim_fixture, af1_sim_unrestricted_use_only_api)
    return fixture_and_api


@case
def case_ag3_sim_surveillance_use_only(
    ag3_sim_fixture, ag3_sim_surveillance_use_only_api
):
    """Test case: the Ag3 fixture paired with the API client created with
    ``surveillance_use_only=True``.
    """
    fixture_and_api = (ag3_sim_fixture, ag3_sim_surveillance_use_only_api)
    return fixture_and_api


@case
def case_af1_sim_surveillance_use_only(
    af1_sim_fixture, af1_sim_surveillance_use_only_api
):
    """Test case: the Af1 fixture paired with the API client created with
    ``surveillance_use_only=True``.
    """
    fixture_and_api = (af1_sim_fixture, af1_sim_surveillance_use_only_api)
    return fixture_and_api


@case
def case_ag3_sim_unrestricted_surveillance_use_only(
    ag3_sim_fixture, ag3_sim_unrestricted_surveillance_use_only_api
):
    """Test case: the Ag3 fixture paired with the API client created with
    both ``unrestricted_use_only=True`` and ``surveillance_use_only=True``.
    """
    fixture_and_api = (
        ag3_sim_fixture,
        ag3_sim_unrestricted_surveillance_use_only_api,
    )
    return fixture_and_api


@case
def case_af1_sim_unrestricted_surveillance_use_only(
    af1_sim_fixture, af1_sim_unrestricted_surveillance_use_only_api
):
    """Test case: the Af1 fixture paired with the API client created with
    both ``unrestricted_use_only=True`` and ``surveillance_use_only=True``.
    """
    fixture_and_api = (
        af1_sim_fixture,
        af1_sim_unrestricted_surveillance_use_only_api,
    )
    return fixture_and_api


def general_metadata_expected_columns():
return {
"sample_id": "O",
Expand Down Expand Up @@ -117,6 +268,11 @@ def test_general_metadata_with_single_sample_set(fixture, api: AnophelesSampleMe
df_sample_sets = api.sample_sets().set_index("sample_set")
sample_count = df_sample_sets["sample_count"]
all_sample_sets = df_sample_sets.index.to_list()

# FIXME: we should probably add more sample sets to the fixtures to test combinations of unrestricted_use_only and surveillance_use_only.
if len(all_sample_sets) == 0:
pytest.skip("Skipping because there are no relevant sample sets to test.")

sample_set = random.choice(all_sample_sets)

# Call function to be tested.
Expand Down

0 comments on commit fc6c2ba

Please sign in to comment.