From cf283ee9044a44d1e66f0db72486ae457ef53819 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marta=20Arcones=20Rodr=C3=ADguez?=
Date: Sun, 5 Nov 2023 21:07:06 +0100
Subject: [PATCH] WIP

---
 .github/workflows/build-and-test.yml |  2 +-
 README.md                            |  5 +--
 kilombo/model/SeveralSrpsFound.py    |  2 +
 kilombo/model/failed_study_reason.py |  3 +-
 kilombo/model/study_hierarchy.py     | 11 ++++--
 kilombo/service/external/ncbi.py     | 57 +++++++++++++++++++---------
 6 files changed, 54 insertions(+), 26 deletions(-)
 create mode 100644 kilombo/model/SeveralSrpsFound.py

diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 0257ac2..cdfd2cf 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -6,7 +6,7 @@ jobs:
     steps:
       - run: curl -sSL https://install.python-poetry.org | python3 -
       - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - run: poetry env use 3.10.12 && poetry env info
       - run: poetry install
       - run: cd tests && poetry run pytest
diff --git a/README.md b/README.md
index 7243653..5cb88c4 100644
--- a/README.md
+++ b/README.md
@@ -17,8 +17,5 @@ Get installed [poetry](https://python-poetry.org/) and then:
 Server will listen in port 8000. You can check the functionality with this example query:
 
 ```shell
-  curl --location 'localhost:8000/query-study-hierarchy?keyword=stroke%20AND%20single%20cell%20rna%20seq%20AND%20musculus'
+  curl 'localhost:8000/query-study-hierarchy?keyword=stroke%20AND%20single%20cell%20rna%20seq%20AND%20musculus'
 ```
-
-
-TODO: check before and after!!!
diff --git a/kilombo/model/SeveralSrpsFound.py b/kilombo/model/SeveralSrpsFound.py
new file mode 100644
index 0000000..b12830f
--- /dev/null
+++ b/kilombo/model/SeveralSrpsFound.py
@@ -0,0 +1,2 @@
+class SeveralSrpsFound(Exception):
+    pass
diff --git a/kilombo/model/failed_study_reason.py b/kilombo/model/failed_study_reason.py
index 7a5eb00..afdfeae 100644
--- a/kilombo/model/failed_study_reason.py
+++ b/kilombo/model/failed_study_reason.py
@@ -3,4 +3,5 @@
 
 
 class FailedStudyReason(Enum):
-    SRP_NOT_FOUND = "Not found SRP"
+    SRP_NOT_FOUND_FOR_STUDY = "Not found SRP for this study"
+    SEVERAL_SRPS_FOR_ONE_STUDY = "Several SRPs were found for this study"
diff --git a/kilombo/model/study_hierarchy.py b/kilombo/model/study_hierarchy.py
index 23a7e6e..e3b3a3a 100644
--- a/kilombo/model/study_hierarchy.py
+++ b/kilombo/model/study_hierarchy.py
@@ -42,6 +42,10 @@ def add_srrs(self, study_id, srrs: []):
         self.successful[study_id]["srrs"] = srrs
 
     def reconcile(self):
+        self._clean_pending_studies_already_processed()
+        self._order_studies()
+
+    def _clean_pending_studies_already_processed(self):
         successful_study_ids_to_remove_from_pending = [study_id[0] for study_id in self.successful.items() if study_id[0] in self.pending.keys()]
         failed_study_ids_to_remove_from_pending = [study_id[0] for study_id in self.failed.items() if study_id[0] in self.pending.keys()]
 
@@ -53,9 +57,10 @@ def reconcile(self):
         if len(self.pending) == 0:
             del self.pending
 
-        if self.pending:
+    def _order_studies(self):
+        if hasattr(self, "pending"):
             self.pending = dict(sorted(self.pending.items()))
-        if self.successful:
+        if hasattr(self, "successful"):
             self.successful = dict(sorted(self.successful.items()))
-        if self.failed:
+        if hasattr(self, "failed"):
             self.failed = dict(sorted(self.failed.items()))
diff --git a/kilombo/service/external/ncbi.py b/kilombo/service/external/ncbi.py
index f03647f..c3d2368 100644
--- a/kilombo/service/external/ncbi.py
+++ b/kilombo/service/external/ncbi.py
@@ -9,6 +9,7 @@
 
 from kilombo.model.failed_study import FailedStudy
 from kilombo.model.failed_study_reason import FailedStudyReason
+from kilombo.model.SeveralSrpsFound import SeveralSrpsFound
 from kilombo.model.study_hierarchy import StudyHierarchy
 
 NCBI_API_KEYS = ["ed06bd0f3c27a605d87e51e94eecab115908", "b81884ffa1519f17cae15f6bd21ac8070108"]
@@ -79,9 +80,9 @@ def _extract_srp_from_study_summaries(study_summary: dict):
 
 def _fetch_study_list(keyword: str):
     url = f"{NCBI_ESEARCH_BASE_URL}?db=gds&retmode=json&retmax=10000&term={keyword}"
-    logging.debug(f"HTTP GET started ==> {url}")
+    logging.debug(f"HTTP GET Started ==> {url}")
     response = requests.get(url)
-    logging.debug(f"HTTP GET done ==> {url}")
+    logging.debug(f"HTTP GET Done ==> {url}")
     return response
 
 
@@ -93,7 +94,7 @@ async def _fetch_study_summaries(study_id: int):
         api_key = NCBI_API_KEYS[0] if retries_count % 2 == 0 else NCBI_API_KEYS[1]
         url = unauthenticated_url + f"&api_key={api_key}"
         async with aiohttp.ClientSession() as session:
-            logging.debug(f"HTTP GET started ==> {url}")
+            logging.debug(f"HTTP GET Started ==> {url}")
             async with session.get(url) as response:
                 logging.debug(f"HTTP GET Done ==> {url}")
                 if response.status == 200:
@@ -107,40 +108,62 @@ async def _fetch_study_summaries(study_id: int):
 
 async def link_study_and_accessions_alternative(study_hierarchy: StudyHierarchy):
     for study_id in study_hierarchy.pending:
-        webenv_gse = _post_esearch_for_term("gds", study_hierarchy.pending[study_id]["GSE"])
+        gse = study_hierarchy.pending[study_id]["GSE"]
+        logging.info(f"Trying to get SRP for {gse} by alternative method")
+        webenv_gse = _post_esearch_for_term("gds", gse)
         srxs_for_gse = _fetch_gse_srxs(webenv_gse)
-        if srxs_for_gse:
-            webenv_srxs = _post_esearch_for_term("sra", " OR ".join(srxs_for_gse))
-            srp = _extract_srp_from_srx_summaries(webenv_srxs)
-            study_hierarchy.move_study_to_successful(study_id, srp)
-        else:
-            study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SRP_NOT_FOUND))
+        try:
+            if srxs_for_gse:
+                webenv_srxs = _post_esearch_for_term("sra", " OR ".join(srxs_for_gse))
+                srp = _extract_srp_from_srx_summaries(webenv_srxs)
+                study_hierarchy.move_study_to_successful(study_id, srp)
+                logging.info(f"Successfully matched {gse} with {srp} by alternative method")
+            else:
+                logging.warning(f"Not found SRP for {gse}")
+                study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SRP_NOT_FOUND_FOR_STUDY))
+        except SeveralSrpsFound:
+            logging.warning(f"Several SRPs found for {gse}")
+            study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SEVERAL_SRPS_FOR_ONE_STUDY))
     study_hierarchy.reconcile()
 
 
 def _post_esearch_for_term(db: str, term: str):
     payload = {"db": db, "usehistory": "n", "retmode": "json", "term": term}
+    logging.debug(f"HTTP POST Started ==> {NCBI_ESEARCH_BASE_URL} with params {payload}")
     response = json.loads(requests.post(NCBI_ESEARCH_BASE_URL, data=payload).text)
+    logging.debug(f"HTTP POST Done ==> {NCBI_ESEARCH_BASE_URL} with params {payload}")
     return response["esearchresult"]["webenv"]
 
 
 def _extract_srp_from_srx_summaries(webenv: str):
     url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1"
+    logging.debug(f"HTTP GET Started ==> {url_with_webenv}")
     response = json.loads(requests.get(url_with_webenv).text)
+    logging.debug(f"HTTP GET Done ==> {url_with_webenv}")
     id_list = response["result"]["uids"]
     root_node = "root_to_avoid_parse_error"
     exp_xmls = [f"<{root_node}>{response['result'][id]['expxml']}</{root_node}>" for id in id_list]
     exp_xmls_parsed = [xmltodict.parse(exp_xml) for exp_xml in exp_xmls]
     srps = [xml[root_node]["Study"]["@acc"] for xml in exp_xmls_parsed]
-    assert len(set(srps)) == 1
+    if len(set(srps)) != 1:
+        raise SeveralSrpsFound()
     return srps[0]
 
 
 def _fetch_gse_srxs(webenv: str):
-    url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1"
-    response = json.loads(requests.get(url_with_webenv).text)
-    id_list = response["result"]["uids"]
-    extrelations = [response["result"][id]["extrelations"] for id in id_list]
-    extrelations = list(filter(lambda extrelation: extrelation, extrelations))
-    srxs = [extrelation[0]["targetobject"] for extrelation in extrelations]
+    retstart = 0
+    batch_size = 500
+    srxs = []
+    while True:
+        url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1&retstart={retstart}&retmax={batch_size}&usehistory=y"
+        logging.debug(f"HTTP GET Started ==> {url_with_webenv}")
+        response = json.loads(requests.get(url_with_webenv).text)
+        logging.debug(f"HTTP GET Done ==> {url_with_webenv}")
+        uids = response["result"]["uids"]
+        extrelations = [response["result"][uid]["extrelations"] for uid in uids]
+        extrelations = list(filter(lambda extrelation: extrelation, extrelations))
+        srxs = srxs + [extrelation[0]["targetobject"] for extrelation in extrelations]
+        retstart += batch_size
+        if len(uids) < batch_size:
+            break
     return srxs