Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
arcones committed Nov 5, 2023
1 parent 44fafbe commit cf283ee
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jobs:
steps:
- run: curl -sSL https://install.python-poetry.org | python3 -
- name: Check out repository code
uses: actions/checkout@v3
uses: actions/checkout@v4
- run: poetry env use 3.10.12 && poetry env info
- run: poetry install
- run: cd tests && poetry run pytest
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,5 @@ Get installed [poetry](https://python-poetry.org/) and then:
The server will listen on port 8000. You can check the functionality with this example query:

```shell
curl --location 'localhost:8000/query-study-hierarchy?keyword=stroke%20AND%20single%20cell%20rna%20seq%20AND%20musculus'
curl 'localhost:8000/query-study-hierarchy?keyword=stroke%20AND%20single%20cell%20rna%20seq%20AND%20musculus'
```


TODO: check before and after!!!
2 changes: 2 additions & 0 deletions kilombo/model/SeveralSrpsFound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class SeveralSrpsFound(Exception):
    """Raised when more than one distinct SRP accession is resolved for a single study."""
3 changes: 2 additions & 1 deletion kilombo/model/failed_study_reason.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@


class FailedStudyReason(Enum):
    """Why a study could not be matched to a single SRA project (SRP) accession."""

    # No SRP accession could be resolved for the study's GSE.
    SRP_NOT_FOUND_FOR_STUDY = "Not found SRP for this study"
    # The study's experiments resolved to more than one distinct SRP.
    SEVERAL_SRPS_FOR_ONE_STUDY = "Several SRPs were found for this study"
11 changes: 8 additions & 3 deletions kilombo/model/study_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ def add_srrs(self, study_id, srrs: []):
self.successful[study_id]["srrs"] = srrs

def reconcile(self):
self._clean_pending_studies_already_processed()
self._order_studies()

def _clean_pending_studies_already_processed(self):
successful_study_ids_to_remove_from_pending = [study_id[0] for study_id in self.successful.items() if study_id[0] in self.pending.keys()]
failed_study_ids_to_remove_from_pending = [study_id[0] for study_id in self.failed.items() if study_id[0] in self.pending.keys()]

Expand All @@ -53,9 +57,10 @@ def reconcile(self):
if len(self.pending) == 0:
del self.pending

if self.pending:
def _order_studies(self):
if hasattr(self, "pending"):
self.pending = dict(sorted(self.pending.items()))
if self.successful:
if hasattr(self, "successful"):
self.successful = dict(sorted(self.successful.items()))
if self.failed:
if hasattr(self, "failed"):
self.failed = dict(sorted(self.failed.items()))
57 changes: 40 additions & 17 deletions kilombo/service/external/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from kilombo.model.failed_study import FailedStudy
from kilombo.model.failed_study_reason import FailedStudyReason
from kilombo.model.SeveralSrpsFound import SeveralSrpsFound
from kilombo.model.study_hierarchy import StudyHierarchy

NCBI_API_KEYS = ["ed06bd0f3c27a605d87e51e94eecab115908", "b81884ffa1519f17cae15f6bd21ac8070108"]
Expand Down Expand Up @@ -79,9 +80,9 @@ def _extract_srp_from_study_summaries(study_summary: dict):

def _fetch_study_list(keyword: str):
    """Search the NCBI GDS database for *keyword* and return the raw esearch response.

    retmax=10000 caps the number of study ids returned in a single call.
    """
    url = f"{NCBI_ESEARCH_BASE_URL}?db=gds&retmode=json&retmax=10000&term={keyword}"
    # Capitalized "Started"/"Done" matches the logging style used elsewhere in
    # this module; the stale lowercase duplicates (diff residue) are removed so
    # each request is logged exactly once.
    logging.debug(f"HTTP GET Started ==> {url}")
    response = requests.get(url)
    logging.debug(f"HTTP GET Done ==> {url}")
    return response


Expand All @@ -93,7 +94,7 @@ async def _fetch_study_summaries(study_id: int):
api_key = NCBI_API_KEYS[0] if retries_count % 2 == 0 else NCBI_API_KEYS[1]
url = unauthenticated_url + f"&api_key={api_key}"
async with aiohttp.ClientSession() as session:
logging.debug(f"HTTP GET started ==> {url}")
logging.debug(f"HTTP GET Started ==> {url}")
async with session.get(url) as response:
logging.debug(f"HTTP GET Done ==> {url}")
if response.status == 200:
Expand All @@ -107,40 +108,62 @@ async def _fetch_study_summaries(study_id: int):

async def link_study_and_accessions_alternative(study_hierarchy: StudyHierarchy):
    """Fallback resolution of SRP accessions for every pending study.

    For each pending study: search NCBI for its GSE, collect the linked SRX
    experiment accessions, and resolve them to a single SRP.  The study is
    moved to successful (exactly one SRP) or to failed (no SRP, or several
    distinct SRPs).  Finally reconcile() removes processed studies from the
    pending bucket and orders the buckets.

    NOTE(review): assumes move_study_to_successful/_failed do not delete
    entries from `pending` while it is being iterated (reconcile() appears to
    do that cleanup afterwards) — confirm against StudyHierarchy.
    """
    for study_id in study_hierarchy.pending:
        gse = study_hierarchy.pending[study_id]["GSE"]
        logging.info(f"Trying to get SRP for {gse} by alternative method")
        webenv_gse = _post_esearch_for_term("gds", gse)
        srxs_for_gse = _fetch_gse_srxs(webenv_gse)
        try:
            if srxs_for_gse:
                webenv_srxs = _post_esearch_for_term("sra", " OR ".join(srxs_for_gse))
                srp = _extract_srp_from_srx_summaries(webenv_srxs)
                study_hierarchy.move_study_to_successful(study_id, srp)
                logging.info(f"Successfully matched {gse} with {srp} by alternative method")
            else:
                logging.warning(f"Not found SRP for {gse}")
                study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SRP_NOT_FOUND_FOR_STUDY))
        except SeveralSrpsFound:
            logging.warning(f"Several SRPs found for {gse}")
            study_hierarchy.move_study_to_failed(FailedStudy(study_id, FailedStudyReason.SEVERAL_SRPS_FOR_ONE_STUDY))
    study_hierarchy.reconcile()


def _post_esearch_for_term(db: str, term: str):
    """POST an NCBI esearch for *term* against database *db*; return the WebEnv token."""
    payload = {"db": db, "usehistory": "n", "retmode": "json", "term": term}
    logging.debug(f"HTTP POST Started ==> {NCBI_ESEARCH_BASE_URL} with params {payload}")
    raw = requests.post(NCBI_ESEARCH_BASE_URL, data=payload)
    logging.debug(f"HTTP POST Done ==> {NCBI_ESEARCH_BASE_URL} with params {payload}")
    return json.loads(raw.text)["esearchresult"]["webenv"]


def _extract_srp_from_srx_summaries(webenv: str):
    """Fetch the summaries stored under *webenv* and return their single SRP accession.

    Raises:
        SeveralSrpsFound: if the summaries do not reference exactly one
            distinct SRP.  (Note this also fires when the uid list is empty,
            i.e. zero SRPs — same as the behavior of the old assert.)
    """
    url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1"
    logging.debug(f"HTTP GET Started ==> {url_with_webenv}")
    response = json.loads(requests.get(url_with_webenv).text)
    logging.debug(f"HTTP GET Done ==> {url_with_webenv}")
    id_list = response["result"]["uids"]
    # The expxml payload is an XML fragment without a single root element;
    # wrap it in a synthetic root so xmltodict can parse it.
    root_node = "root_to_avoid_parse_error"
    exp_xmls = [f"<{root_node}>{response['result'][uid]['expxml']}</{root_node}>" for uid in id_list]
    exp_xmls_parsed = [xmltodict.parse(exp_xml) for exp_xml in exp_xmls]
    srps = [xml[root_node]["Study"]["@acc"] for xml in exp_xmls_parsed]
    # The stale `assert len(set(srps)) == 1` (diff residue) is removed: it
    # pre-empted this raise (so SeveralSrpsFound could never propagate) and
    # would silently disappear under `python -O`.
    if len(set(srps)) != 1:
        raise SeveralSrpsFound()
    return srps[0]


def _fetch_gse_srxs(webenv: str):
    """Page through the GDS summaries stored under *webenv*, collecting SRX accessions.

    Calls esummary in batches of 500 (retstart/retmax) until a short page
    signals the end of the result set; entries without extrelations are
    skipped.  Returns the list of SRX accessions found.

    The dead pre-pagination implementation (diff residue) is removed: it made
    an extra unpaginated HTTP request whose result was immediately discarded
    by `srxs = []`.
    """
    retstart = 0
    batch_size = 500
    srxs = []
    while True:
        url_with_webenv = f"{NCBI_ESUMMARY_BASE_URL}?db=gds&retmode=json&WebEnv={webenv}&query_key=1&retstart={retstart}&retmax={batch_size}&usehistory=y"
        logging.debug(f"HTTP GET Started ==> {url_with_webenv}")
        response = json.loads(requests.get(url_with_webenv).text)
        logging.debug(f"HTTP GET Done ==> {url_with_webenv}")
        uids = response["result"]["uids"]
        extrelations = [response["result"][uid]["extrelations"] for uid in uids]
        # Drop entries with empty/missing relations before extracting targets.
        extrelations = [extrelation for extrelation in extrelations if extrelation]
        srxs += [extrelation[0]["targetobject"] for extrelation in extrelations]
        retstart += batch_size
        # A short (or empty) page means we have reached the end of the results.
        if len(uids) < batch_size:
            break
    return srxs

0 comments on commit cf283ee

Please sign in to comment.