Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
arcones committed Nov 5, 2023
1 parent 44fafbe commit cf283ee
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jobs:
steps:
- run: curl -sSL https://install.python-poetry.org | python3 -
- name: Check out repository code
uses: actions/checkout@v3
uses: actions/checkout@v4
- run: poetry env use 3.10.12 && poetry env info
- run: poetry install
- run: cd tests && poetry run pytest
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,5 @@ Get installed [poetry](https://python-poetry.org/) and then:
The server will listen on port 8000. You can check the functionality with this example query:

```shell
curl --location 'localhost:8000/query-study-hierarchy?keyword=stroke%20AND%20single%20cell%20rna%20seq%20AND%20musculus'
curl 'localhost:8000/query-study-hierarchy?keyword=stroke%20AND%20single%20cell%20rna%20seq%20AND%20musculus'
```


TODO: check before and after!!!
2 changes: 2 additions & 0 deletions kilombo/model/SeveralSrpsFound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class SeveralSrpsFound(Exception):
    """Raised when more than one distinct SRP accession is resolved for a single study."""
3 changes: 2 additions & 1 deletion kilombo/model/failed_study_reason.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@


class FailedStudyReason(Enum):
    """Why a study could not be matched to a single SRA project (SRP) accession."""

    # No SRP accession could be resolved for the study's GSE.
    SRP_NOT_FOUND_FOR_STUDY = "Not found SRP for this study"
    # The study's experiments resolved to more than one distinct SRP.
    SEVERAL_SRPS_FOR_ONE_STUDY = "Several SRPs were found for this study"
11 changes: 8 additions & 3 deletions kilombo/model/study_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ def add_srrs(self, study_id, srrs: []):
self.successful[study_id]["srrs"] = srrs

def reconcile(self):
self._clean_pending_studies_already_processed()
self._order_studies()

def _clean_pending_studies_already_processed(self):
successful_study_ids_to_remove_from_pending = [study_id[0] for study_id in self.successful.items() if study_id[0] in self.pending.keys()]
failed_study_ids_to_remove_from_pending = [study_id[0] for study_id in self.failed.items() if study_id[0] in self.pending.keys()]

Expand All @@ -53,9 +57,10 @@ def reconcile(self):
if len(self.pending) == 0:
del self.pending

if self.pending:
def _order_studies(self):
if hasattr(self, "pending"):
self.pending = dict(sorted(self.pending.items()))
if self.successful:
if hasattr(self, "successful"):
self.successful = dict(sorted(self.successful.items()))
if self.failed:
if hasattr(self, "failed"):
self.failed = dict(sorted(self.failed.items()))
57 changes: 40 additions & 17 deletions kilombo/service/external/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from kilombo.model.failed_study import FailedStudy
from kilombo.model.failed_study_reason import FailedStudyReason
from kilombo.model.SeveralSrpsFound import SeveralSrpsFound
from kilombo.model.study_hierarchy import StudyHierarchy

NCBI_API_KEYS = ["ed06bd0f3c27a605d87e51e94eecab115908", "b81884ffa1519f17cae15f6bd21ac8070108"]
Expand Down Expand Up @@ -79,9 +80,9 @@ def _extract_srp_from_study_summaries(study_summary: dict):

def _fetch_study_list(keyword: str):
    """Search the NCBI GDS database for *keyword* and return the raw esearch response.

    retmax=10000 caps the number of study ids returned in a single call.
    """
    url = f"{NCBI_ESEARCH_BASE_URL}?db=gds&retmode=json&retmax=10000&term={keyword}"
    # Capitalized "Started"/"Done" matches the logging style used elsewhere in
    # this module; the stale lowercase duplicates (diff residue) are removed so
    # each request is logged exactly once.
    logging.debug(f"HTTP GET Started ==> {url}")
    response = requests.get(url)
    logging.debug(f"HTTP GET Done ==> {url}")
    return response


Expand All @@ -93,7 +94,7 @@ async def _fetch_study_summaries(study_id: int):
api_key = NCBI_API_KEYS[0] if retries_count % 2 == 0 else NCBI_API_KEYS[1]
url = unauthenticated_url + f"&api_key={api_key}"
async with aiohttp.ClientSession() as session:
logging.debug(f"HTTP GET started ==> {url}")
logging.debug(f"HTTP GET Started ==> {url}")
async with session.get(url) as response:
logging.debug(f"HTTP GET Done ==> {url}")
if response.status == 200:
Expand All @@ -107,40 +108,62 @@ async def _fetch_study_summaries(study_id: int):

async def link_study_and_accessions_alternative(study_hierarchy: StudyHierarchy):
    """Fallback resolution of SRP accessions for every pending study.

    For each pending study: search NCBI for its GSE, collect the linked SRX
    experiment accessions, and resolve them to a single SRP.  The study is
    moved to successful (exactly one SRP) or to failed (no SRP, or several
    distinct SRPs).  Finally reconcile() removes processed studies from the
    pending bucket and orders the buckets.

    NOTE(review): assumes move_study_to_successful/_failed do not delete
    entries from `pending` while it is being iterated (reconcile() appears to
    do that cleanup afterwards) — confirm against StudyHierarchy.
    """
    for study_id in study_hierarchy.pending:
        gse = study_hierarchy.pending[study_id]["GSE"]
        logging.info(f"Trying to get SRP for {gse} by alternative method")
        webenv_gse = _post_esearch_for_term("gds", gse)
        srxs_for_gse = _fetch_gse_srxs(webenv_gse)
        try:
            if srxs_for_gse:
                webenv_srxs = _post_esearch_for_term("sra", " OR ".join(srxs_for_gse))
                srp = _extract_srp_from_srx_summaries(webenv_srxs)
                study_hierarchy.move_study_to_successful(study_id, srp)
                logging.info(f"Successfully matched {gse} with {srp} by alternative method")
            else:
                logging.warning(f"Not found SRP for {gse}")
                study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SRP_NOT_FOUND_FOR_STUDY))
        except SeveralSrpsFound:
            logging.warning(f"Several SRPs found for {gse}")
            study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SEVERAL_SRPS_FOR_ONE_STUDY))
    study_hierarchy.reconcile()


def _post_esearch_for_term(db: str, term: str):
    """POST an NCBI esearch for *term* against database *db*; return the WebEnv token."""
    payload = {"db": db, "usehistory": "n", "retmode": "json", "term": term}
    logging.debug(f"HTTP POST Started ==> {NCBI_ESEARCH_BASE_URL} with params {payload}")
    raw = requests.post(NCBI_ESEARCH_BASE_URL, data=payload)
    logging.debug(f"HTTP POST Done ==> {NCBI_ESEARCH_BASE_URL} with params {payload}")
    return json.loads(raw.text)["esearchresult"]["webenv"]


def _extract_srp_from_srx_summaries(webenv: str):
    """Fetch the summaries stored under *webenv* and return their single SRP accession.

    Raises:
        SeveralSrpsFound: if the summaries do not reference exactly one
            distinct SRP.  (Note this also fires when the uid list is empty,
            i.e. zero SRPs — same as the behavior of the old assert.)
    """
    url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1"
    logging.debug(f"HTTP GET Started ==> {url_with_webenv}")
    response = json.loads(requests.get(url_with_webenv).text)
    logging.debug(f"HTTP GET Done ==> {url_with_webenv}")
    id_list = response["result"]["uids"]
    # The expxml payload is an XML fragment without a single root element;
    # wrap it in a synthetic root so xmltodict can parse it.
    root_node = "root_to_avoid_parse_error"
    exp_xmls = [f"<{root_node}>{response['result'][uid]['expxml']}</{root_node}>" for uid in id_list]
    exp_xmls_parsed = [xmltodict.parse(exp_xml) for exp_xml in exp_xmls]
    srps = [xml[root_node]["Study"]["@acc"] for xml in exp_xmls_parsed]
    # The stale `assert len(set(srps)) == 1` (diff residue) is removed: it
    # pre-empted this raise (so SeveralSrpsFound could never propagate) and
    # would silently disappear under `python -O`.
    if len(set(srps)) != 1:
        raise SeveralSrpsFound()
    return srps[0]


def _fetch_gse_srxs(webenv: str):
    """Page through the GDS summaries stored under *webenv*, collecting SRX accessions.

    Calls esummary in batches of 500 (retstart/retmax) until a short page
    signals the end of the result set; entries without extrelations are
    skipped.  Returns the list of SRX accessions found.

    The dead pre-pagination implementation (diff residue) is removed: it made
    an extra unpaginated HTTP request whose result was immediately discarded
    by `srxs = []`.
    """
    retstart = 0
    batch_size = 500
    srxs = []
    while True:
        url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1&retstart={retstart}&retmax={batch_size}&usehistory=y"
        logging.debug(f"HTTP GET Started ==> {url_with_webenv}")
        response = json.loads(requests.get(url_with_webenv).text)
        logging.debug(f"HTTP GET Done ==> {url_with_webenv}")
        uids = response["result"]["uids"]
        extrelations = [response["result"][uid]["extrelations"] for uid in uids]
        # Drop entries with empty/missing relations before extracting targets.
        extrelations = [extrelation for extrelation in extrelations if extrelation]
        srxs += [extrelation[0]["targetobject"] for extrelation in extrelations]
        retstart += batch_size
        # A short (or empty) page means we have reached the end of the results.
        if len(uids) < batch_size:
            break
    return srxs

0 comments on commit cf283ee

Please sign in to comment.