From cf283ee9044a44d1e66f0db72486ae457ef53819 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marta=20Arcones=20Rodr=C3=ADguez?=
Date: Sun, 5 Nov 2023 21:07:06 +0100
Subject: [PATCH] WIP

---
 .github/workflows/build-and-test.yml |  2 +-
 README.md                            |  5 +--
 kilombo/model/SeveralSrpsFound.py    |  2 +
 kilombo/model/failed_study_reason.py |  3 +-
 kilombo/model/study_hierarchy.py     | 11 ++++--
 kilombo/service/external/ncbi.py     | 57 +++++++++++++++++++---------
 6 files changed, 54 insertions(+), 26 deletions(-)
 create mode 100644 kilombo/model/SeveralSrpsFound.py

diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 0257ac2..cdfd2cf 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -6,7 +6,7 @@ jobs:
     steps:
       - run: curl -sSL https://install.python-poetry.org | python3 -
       - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - run: poetry env use 3.10.12 && poetry env info
       - run: poetry install
       - run: cd tests && poetry run pytest
diff --git a/README.md b/README.md
index 7243653..5cb88c4 100644
--- a/README.md
+++ b/README.md
@@ -17,8 +17,5 @@ Get installed [poetry](https://python-poetry.org/) and then:
 Server will listen in port 8000. You can check the functionality with this example query:
 
 ```shell
-  curl --location 'localhost:8000/query-study-hierarchy?keyword=stroke%20AND%20single%20cell%20rna%20seq%20AND%20musculus'
+  curl 'localhost:8000/query-study-hierarchy?keyword=stroke%20AND%20single%20cell%20rna%20seq%20AND%20musculus'
 ```
-
-
-TODO: check before and after!!!
diff --git a/kilombo/model/SeveralSrpsFound.py b/kilombo/model/SeveralSrpsFound.py
new file mode 100644
index 0000000..b12830f
--- /dev/null
+++ b/kilombo/model/SeveralSrpsFound.py
@@ -0,0 +1,2 @@
+class SeveralSrpsFound(Exception):
+    pass
diff --git a/kilombo/model/failed_study_reason.py b/kilombo/model/failed_study_reason.py
index 7a5eb00..afdfeae 100644
--- a/kilombo/model/failed_study_reason.py
+++ b/kilombo/model/failed_study_reason.py
@@ -3,4 +3,5 @@
 
 
 class FailedStudyReason(Enum):
-    SRP_NOT_FOUND = "Not found SRP"
+    SRP_NOT_FOUND_FOR_STUDY = "Not found SRP for this study"
+    SEVERAL_SRPS_FOR_ONE_STUDY = "Several SRPs were found for this study"
diff --git a/kilombo/model/study_hierarchy.py b/kilombo/model/study_hierarchy.py
index 23a7e6e..e3b3a3a 100644
--- a/kilombo/model/study_hierarchy.py
+++ b/kilombo/model/study_hierarchy.py
@@ -42,6 +42,10 @@ def add_srrs(self, study_id, srrs: []):
         self.successful[study_id]["srrs"] = srrs
 
     def reconcile(self):
+        self._clean_pending_studies_already_processed()
+        self._order_studies()
+
+    def _clean_pending_studies_already_processed(self):
         successful_study_ids_to_remove_from_pending = [study_id[0] for study_id in self.successful.items() if study_id[0] in self.pending.keys()]
         failed_study_ids_to_remove_from_pending = [study_id[0] for study_id in self.failed.items() if study_id[0] in self.pending.keys()]
 
@@ -53,9 +57,10 @@ def reconcile(self):
         if len(self.pending) == 0:
             del self.pending
 
-        if self.pending:
+    def _order_studies(self):
+        if hasattr(self, "pending"):
             self.pending = dict(sorted(self.pending.items()))
-        if self.successful:
+        if hasattr(self, "successful"):
             self.successful = dict(sorted(self.successful.items()))
-        if self.failed:
+        if hasattr(self, "failed"):
             self.failed = dict(sorted(self.failed.items()))
diff --git a/kilombo/service/external/ncbi.py b/kilombo/service/external/ncbi.py
index f03647f..c3d2368 100644
--- a/kilombo/service/external/ncbi.py
+++ b/kilombo/service/external/ncbi.py
@@ -9,6 +9,7 @@
 
 from kilombo.model.failed_study import FailedStudy
 from kilombo.model.failed_study_reason import FailedStudyReason
+from kilombo.model.SeveralSrpsFound import SeveralSrpsFound
 from kilombo.model.study_hierarchy import StudyHierarchy
 
 NCBI_API_KEYS = ["ed06bd0f3c27a605d87e51e94eecab115908", "b81884ffa1519f17cae15f6bd21ac8070108"]
@@ -79,9 +80,9 @@ def _extract_srp_from_study_summaries(study_summary: dict):
 
 def _fetch_study_list(keyword: str):
     url = f"{NCBI_ESEARCH_BASE_URL}?db=gds&retmode=json&retmax=10000&term={keyword}"
-    logging.debug(f"HTTP GET started ==> {url}")
+    logging.debug(f"HTTP GET Started ==> {url}")
     response = requests.get(url)
-    logging.debug(f"HTTP GET done ==> {url}")
+    logging.debug(f"HTTP GET Done ==> {url}")
     return response
 
 
@@ -93,7 +94,7 @@ async def _fetch_study_summaries(study_id: int):
         api_key = NCBI_API_KEYS[0] if retries_count % 2 == 0 else NCBI_API_KEYS[1]
         url = unauthenticated_url + f"&api_key={api_key}"
         async with aiohttp.ClientSession() as session:
-            logging.debug(f"HTTP GET started ==> {url}")
+            logging.debug(f"HTTP GET Started ==> {url}")
             async with session.get(url) as response:
                 logging.debug(f"HTTP GET Done ==> {url}")
                 if response.status == 200:
@@ -107,40 +108,62 @@ async def _fetch_study_summaries(study_id: int):
 
 async def link_study_and_accessions_alternative(study_hierarchy: StudyHierarchy):
     for study_id in study_hierarchy.pending:
-        webenv_gse = _post_esearch_for_term("gds", study_hierarchy.pending[study_id]["GSE"])
+        gse = study_hierarchy.pending[study_id]["GSE"]
+        logging.info(f"Trying to get SRP for {gse} by alternative method")
+        webenv_gse = _post_esearch_for_term("gds", gse)
         srxs_for_gse = _fetch_gse_srxs(webenv_gse)
-        if srxs_for_gse:
-            webenv_srxs = _post_esearch_for_term("sra", " OR ".join(srxs_for_gse))
-            srp = _extract_srp_from_srx_summaries(webenv_srxs)
-            study_hierarchy.move_study_to_successful(study_id, srp)
-        else:
-            study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SRP_NOT_FOUND))
+        try:
+            if srxs_for_gse:
+                webenv_srxs = _post_esearch_for_term("sra", " OR ".join(srxs_for_gse))
+                srp = _extract_srp_from_srx_summaries(webenv_srxs)
+                study_hierarchy.move_study_to_successful(study_id, srp)
+                logging.info(f"Successfully matched {gse} with {srp} by alternative method")
+            else:
+                logging.warning(f"Not found SRP for {gse}")
+                study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SRP_NOT_FOUND_FOR_STUDY))
+        except SeveralSrpsFound:
+            logging.warning(f"Several SRPs found for {gse}")
+            study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SEVERAL_SRPS_FOR_ONE_STUDY))
     study_hierarchy.reconcile()
 
 
 def _post_esearch_for_term(db: str, term: str):
     payload = {"db": db, "usehistory": "n", "retmode": "json", "term": term}
+    logging.debug(f"HTTP POST Started ==> {NCBI_ESEARCH_BASE_URL} with params {payload}")
     response = json.loads(requests.post(NCBI_ESEARCH_BASE_URL, data=payload).text)
+    logging.debug(f"HTTP POST Done ==> {NCBI_ESEARCH_BASE_URL} with params {payload}")
     return response["esearchresult"]["webenv"]
 
 
 def _extract_srp_from_srx_summaries(webenv: str):
     url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1"
+    logging.debug(f"HTTP GET Started ==> {url_with_webenv}")
     response = json.loads(requests.get(url_with_webenv).text)
+    logging.debug(f"HTTP GET Done ==> {url_with_webenv}")
     id_list = response["result"]["uids"]
     root_node = "root_to_avoid_parse_error"
     exp_xmls = [f"<{root_node}>{response['result'][id]['expxml']}</{root_node}>" for id in id_list]
     exp_xmls_parsed = [xmltodict.parse(exp_xml) for exp_xml in exp_xmls]
     srps = [xml[root_node]["Study"]["@acc"] for xml in exp_xmls_parsed]
-    assert len(set(srps)) == 1
+    if len(set(srps)) != 1:
+        raise SeveralSrpsFound()
     return srps[0]
 
 
 def _fetch_gse_srxs(webenv: str):
-    url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1"
-    response = json.loads(requests.get(url_with_webenv).text)
-    id_list = response["result"]["uids"]
-    extrelations = [response["result"][id]["extrelations"] for id in id_list]
-    extrelations = list(filter(lambda extrelation: extrelation, extrelations))
-    srxs = [extrelation[0]["targetobject"] for extrelation in extrelations]
+    retstart = 0
+    batch_size = 500
+    srxs = []
+    while True:
+        url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1&retstart={retstart}&retmax={batch_size}&usehistory=y"
+        logging.debug(f"HTTP GET Started ==> {url_with_webenv}")
+        response = json.loads(requests.get(url_with_webenv).text)
+        logging.debug(f"HTTP GET Done ==> {url_with_webenv}")
+        uids = response["result"]["uids"]
+        extrelations = [response["result"][uid]["extrelations"] for uid in uids]
+        extrelations = list(filter(lambda extrelation: extrelation, extrelations))
+        srxs = srxs + [extrelation[0]["targetobject"] for extrelation in extrelations]
+        retstart += batch_size
+        if len(uids) < batch_size:
+            break
     return srxs