
Commit

Bug fixes
axdanbol committed Feb 23, 2024
1 parent 76c28b9 commit 62fcd09
Showing 13 changed files with 99 additions and 34 deletions.
22 changes: 21 additions & 1 deletion package-lock.json

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion package.json
@@ -20,6 +20,7 @@
},
"dependencies": {
"glob": "^10.3.10",
"papaparse": "^5.4.1"
"papaparse": "^5.4.1",
"undici": "^6.6.2"
}
}
11 changes: 7 additions & 4 deletions src/cellxgene/downloader.js
@@ -82,17 +82,20 @@ export class Downloader {
dataset.block_id = `${dataset.id}_Block`;
dataset.dataset_id = dataset.id;

// Parse sex line. Format: `sex: X\n`
const sex_match = /sex:(.+)\n/i.exec(stdout);
dataset.donor_sex = sex_match?.[1].trim() ?? '';

// Parse age line. Format: `age: X\n`
const age_match = /age:(.+)\n/i.exec(stdout);
dataset.donor_development_stage = age_match?.[1].trim() ?? '';

// Parse ethnicity line. Format: `ethnicity: X\n`
const ethnicity_match = /ethnicity:(.+)\n/i.exec(stdout);
dataset.donor_race = ethnicity_match?.[1].trim() ?? '';

const cell_count_match = /cell_count:\s*(\d+)\s*\n/i.exec(stdout);
dataset.dataset_cell_count = parseInt(cell_count_match?.[1]);

const gene_count_match = /gene_count:\s*(\d+)\s*\n/i.exec(stdout);
dataset.dataset_gene_count = parseInt(gene_count_match?.[1]);
}

/**
@@ -126,7 +129,7 @@ export class Downloader {
const { url } = await resp.json();
const outputFile = join(getCacheDir(this.config), `cellxgene-${id}.h5ad`);

await logEvent('CellXGene:DownloadAsset', id, () =>
await logEvent('CellXGene:DownloadAsset', id, url, () =>
downloadFile(outputFile, url, {
overwrite: this.config.get(FORCE, false),
})
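
For context, the new count fields are scraped from the Python script's stdout using the same `key: value` regex approach already applied to sex, age, and ethnicity. A minimal sketch of a shared helper that captures the pattern (hypothetical, not part of this commit):

```js
// Hypothetical helper illustrating the `key: value\n` stdout parsing used above.
// Not part of this commit; shown only to clarify the pattern.
function parseField(stdout, key) {
  const match = new RegExp(`${key}:(.+)\\n`, 'i').exec(stdout);
  return match?.[1].trim() ?? '';
}

// Usage mirroring the assignments in the Downloader:
// dataset.donor_sex = parseField(stdout, 'sex');
// dataset.dataset_cell_count = parseInt(parseField(stdout, 'cell_count'), 10);
```
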
4 changes: 3 additions & 1 deletion src/cellxgene/extract_donor_metadata.py
@@ -10,7 +10,7 @@

def main(args: argparse.Namespace):
"""Print information from a h5ad file.
Printed values include "sex", "age", and "ethnicity".
Printed values include "sex", "age", "ethnicity", "cell_count", and "gene_count".
Args:
args (argparse.Namespace): CLI arguments, must contain "file"
@@ -19,6 +19,8 @@ def main(args: argparse.Namespace):
print("sex:", data.obs[SEX_COLUMN][0])
print("age:", data.obs[AGE_COLUMN][0])
print("ethnicity:", data.obs[ETHNICITY_COLUMN][0])
print("cell_count:", len(data.obs))
print("gene_count:", len(data.var))


def _get_arg_parser() -> argparse.ArgumentParser:
3 changes: 3 additions & 0 deletions src/combine-metadata.js
@@ -19,6 +19,9 @@ const METADATA_FIELDS = [
'dataset_id',
'dataset_link',
'dataset_technology',
'dataset_info',
'dataset_cell_count',
'dataset_gene_count',

'publication',
'publication_title',
15 changes: 7 additions & 8 deletions src/gtex/downloader.js
@@ -121,37 +121,36 @@ export class Downloader {
dataset.dataFilePath,
]);

// Parse organ line. Format: `organ: X\n`
const organ_match = /organ:(.+)\n/i.exec(stdout);
dataset.organ_source = organ_match?.[1].trim() ?? '';

const organ = dataset.organ_source.toLowerCase();
dataset.organ = this.organMetadata.resolve(ORGAN_MAPPING[organ] ?? '');
dataset.organ_id = dataset.organ ? `http://purl.obolibrary.org/obo/UBERON_${dataset.organ.split(':')[1]}` : '';

// Parse sex line. Format: `sex: X\n`
const sex_match = /sex:(.+)\n/i.exec(stdout);
dataset.donor_sex = sex_match?.[1].trim() ?? '';

// Parse age line. Format: `age: X\n`
const age_match = /age:(.+)\n/i.exec(stdout);
dataset.donor_age_bin = age_match?.[1].trim() ?? '';

// Parse donor_id line. Format: `donor_id: X\n`
const donor_id_match = /donor_id:(.+)\n/i.exec(stdout);
dataset.donor_id = `${GTEX_DOI_URL}#${donor_id_match?.[1].trim()}` ?? '';

dataset.organ_id = dataset.organ ? `http://purl.obolibrary.org/obo/UBERON_${dataset.organ.split(':')[1]}` : '';
const cell_count_match = /cell_count:\s*(\d+)\s*\n/i.exec(stdout);
dataset.dataset_cell_count = parseInt(cell_count_match?.[1]);

const gene_count_match = /gene_count:\s*(\d+)\s*\n/i.exec(stdout);
dataset.dataset_gene_count = parseInt(gene_count_match?.[1]);

// Parse tissue_site line. Format: `tissue_site: X\n`
const tissue_site_match = /tissue_site:(.+)\n/i.exec(stdout);
const tissueSite =
`${GTEX_BLOCK_URL}${tissue_site_match?.[1]
.trim()
.replace(/[^a-zA-Z]+/g, '_')
.replace(/_$/, '')}` ?? '';

dataset.rui_location = this.extractionSiteLookup[tissueSite] ?? '';

dataset.block_id = `${dataset.dataset_id}_TissueBlock`;
dataset.rui_location = this.extractionSiteLookup[tissueSite] ?? '';
}
}
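
For readers unfamiliar with the identifiers: `dataset.organ` holds an UBERON CURIE resolved through OrganMetadata, and the relocated `organ_id` assignment expands it into an OBO PURL. A small illustration with a hypothetical organ code:

```js
// Illustration only; 'UBERON:0002048' (lung) is an example value, not taken from this diff.
const organ = 'UBERON:0002048';
const organ_id = organ
  ? `http://purl.obolibrary.org/obo/UBERON_${organ.split(':')[1]}`
  : '';
// organ_id === 'http://purl.obolibrary.org/obo/UBERON_0002048'
```
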
4 changes: 3 additions & 1 deletion src/gtex/extract_dataset.py
@@ -14,7 +14,7 @@

def main(args: argparse.Namespace):
"""Subsets and prints information from a h5ad file.
Printed values include "organ", "sex", "age", "donor_id", and "tissue_site".
Printed values include "organ", "sex", "age", "donor_id", "cell_count", "gene_count" and "tissue_site".
Args:
args (argparse.Namespace): CLI arguments, must include "file", "dataset", and "output"
@@ -27,6 +27,8 @@ def main(args: argparse.Namespace):
print("sex:", subset.obs[SEX_COLUMN][0])
print("age:", subset.obs[AGE_COLUMN][0])
print("donor_id:", subset.obs[DONOR_ID_COLUMN][0])
print("cell_count:", len(subset.obs))
print("gene_count:", len(subset.var))
print("tissue_site:", subset.obs[TISSUE_SITE_COLUMN][0], flush=True)


7 changes: 4 additions & 3 deletions src/hubmap/metadata.js
@@ -24,7 +24,7 @@ export const METADATA_FIELDS = [
'uuid',
'hubmap_id',
'origin_samples.organ',
'data_types', // TODO: Replace with dataset_type
'dataset_info',
'mapped_consortium',
'group_name',
'group_uuid',
@@ -51,7 +51,7 @@ export function metadataToLookup(result, organMetadata) {
hubmap_id,
uuid,
origin_samples: [{ organ }],
data_types: [assay_type],
dataset_info,
mapped_consortium,
group_name,
group_uuid,
@@ -73,10 +73,11 @@
organ: mapped_organ,
organ_source: organ,
uuid,
assay_type,
assay_type: dataset_info.split('__')[0],
dataset_id: `${HUBMAP_ENTITY_ENDPOINT}${uuid}`,
dataset_link: `${HUBMAP_PORTAL_ENDPOINT}${uuid}`,
dataset_technology: 'OTHER',
dataset_info,
consortium_name: mapped_consortium,
provider_name: group_name,
provider_uuid: group_uuid,
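
The lookup now derives `assay_type` from the first `__`-separated segment of `dataset_info` rather than the removed `data_types` field. A rough illustration (the `dataset_info` value below is hypothetical):

```js
// Hypothetical dataset_info value; only the split-on-'__' behavior comes from the diff.
const dataset_info = 'salmon_rnaseq_10x__some_pipeline_suffix';
const assay_type = dataset_info.split('__')[0];
// assay_type === 'salmon_rnaseq_10x'
```
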
34 changes: 26 additions & 8 deletions src/organ/metadata.js
@@ -6,10 +6,20 @@ import { ALGORITHMS, DEFAULT_MAX_CONCURRENCY, FORCE, MAX_CONCURRENCY } from '../
import { downloadFile, ensureDirsExist } from '../util/fs.js';
import { getOutputDir } from '../util/paths.js';

/**
* @typedef {Record<string, string | Record<string, any>>} RawOrganMetadata
*/

/** Template for organ metadata file urls */
const ORGAN_METADATA_URL_TEMPLATE =
'https://raw.githubusercontent.com/hubmapconsortium/hra-workflows/main/containers/{{algorithm}}/context/organ-metadata.json';

/**
* Raw metadata loaded from file cached by algorithm
* @type {Map<string, Promise<RawOrganMetadata>>}
*/
const cachedMetadataFileDownload = new Map();

/**
* Tests whether a value is a string
*
@@ -38,17 +48,25 @@ export class OrganMetadata {
* @param {Config} config Configuration
*/
static async load(algorithm, config) {
const url = ORGAN_METADATA_URL_TEMPLATE.replace('{{algorithm}}', algorithm);
const dir = join(getOutputDir(config), 'organ-metadata');
const file = join(dir, `${algorithm}.json`);
if (!cachedMetadataFileDownload.has(algorithm)) {
const url = ORGAN_METADATA_URL_TEMPLATE.replace('{{algorithm}}', algorithm);
const dir = join(getOutputDir(config), 'organ-metadata');
const file = join(dir, `${algorithm}.json`);
const download = async () => {
await ensureDirsExist(dir);
await downloadFile(file, url, { overwrite: config.get(FORCE, false) });
return await loadJson(file);
};

cachedMetadataFileDownload.set(algorithm, download());
}

await ensureDirsExist(dir);
await downloadFile(file, url, { overwrite: config.get(FORCE, false) });
return new OrganMetadata(await loadJson(file));
const result = await cachedMetadataFileDownload.get(algorithm);
return new OrganMetadata({ ...result });
}

constructor(metadata) {
/** @type {Record<string, string | Record<string, any>>} */
/** @type {RawOrganMetadata} */
this.metadata = metadata;
}

@@ -64,7 +82,7 @@
* May return a redirect organ code.
*
* @param {string} organ Organ code
* @returns {string | Record<string, any> | undefined}
* @returns {RawOrganMetadata[string] | undefined}
*/
get(organ) {
return this.metadata[organ];
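
The reworked `OrganMetadata.load` memoizes the in-flight download promise per algorithm so concurrent callers share a single fetch instead of racing to write the same file. A stripped-down sketch of that memoization pattern (simplified names, not the module's exact code):

```js
// Generic promise-per-key memoization, as used by OrganMetadata.load above.
const cache = new Map();

async function loadOnce(key, produce) {
  if (!cache.has(key)) {
    // Store the promise immediately so concurrent callers await the same work.
    cache.set(key, produce(key));
  }
  return cache.get(key);
}

// Usage sketch (names are illustrative):
// const metadata = await loadOnce('azimuth', downloadAndParseOrganMetadata);
```
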
8 changes: 4 additions & 4 deletions src/sennet/metadata.js
@@ -26,7 +26,7 @@
'uuid',
'sennet_id',
'origin_sample.organ',
'dataset_type',
'dataset_info',
'group_name',
'group_uuid',
'source.source_mapped_metadata.race',
@@ -52,8 +52,7 @@ export function toLookup(result, organMetadata) {
sennet_id,
uuid,
origin_sample: { organ },
dataset_type: assay_type,
dataset_info: i,
dataset_info,
group_name,
group_uuid,
source: {
@@ -75,10 +74,11 @@
organ: mapped_organ,
organ_source: organ,
uuid,
assay_type,
assay_type: dataset_info.split('__')[0],
dataset_id: `${SENNET_ENTITY_ENDPOINT}${uuid}`,
dataset_link: `${SENNET_PORTAL_ENDPOINT}?uuid=${uuid}`,
dataset_technology: 'OTHER',
dataset_info,
consortium_name: 'SenNet',
provider_name: group_name,
provider_uuid: group_uuid,
6 changes: 5 additions & 1 deletion src/util/fs.js
@@ -1,4 +1,5 @@
import { access, constants, mkdir, open, writeFile } from 'node:fs/promises';
import { Agent } from 'undici';
import { concurrentMap } from './concurrent-map.js';

/**
@@ -52,7 +53,10 @@ export async function downloadFile(dest, src, options = {}) {
}

try {
const resp = await fetch(src, options);
const resp = await fetch(src, {
dispatcher: new Agent({ connectTimeout: 300e3 }),
...options,
});
checkFetchResponse(resp);

await writeFile(fileHandle, resp.body, { encoding: 'utf8' });
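
`downloadFile` now installs an undici `Agent` with a 300-second connect timeout (300e3 ms) as the default fetch dispatcher, and because `...options` is spread after it, callers can still override it. A hedged usage sketch (file path, URL, timeout value, and import path are illustrative):

```js
import { Agent } from 'undici';
import { downloadFile } from './util/fs.js';

// Caller-supplied dispatcher overriding the default set inside downloadFile.
await downloadFile('cache/example.h5ad', 'https://example.org/example.h5ad', {
  overwrite: false,
  dispatcher: new Agent({ connectTimeout: 60e3 }),
});
```
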
10 changes: 8 additions & 2 deletions src/xconsortia/downloader.js
@@ -48,20 +48,26 @@ export class XConsortiaDownloader {
overwrite: this.config.get(FORCE, false),
});

await execFile('python3', [
const { stdout } = await execFile('python3', [
this.exprAdjustScriptFilePath,
dataset.dataFilePath,
'--assay',
dataset.assay_type,
'--output',
dataset.dataFilePath,
]);

const cell_count_match = /cell_count:\s*(\d+)\s*\n/i.exec(stdout);
dataset.dataset_cell_count = parseInt(cell_count_match?.[1]);

const gene_count_match = /gene_count:\s*(\d+)\s*\n/i.exec(stdout);
dataset.dataset_gene_count = parseInt(gene_count_match?.[1]);
}

/**
* Get a lookup map for associating metadata with a dataset.
* Must be overridden in subclasses.
*
*
* @param {string[]} ids Dataset ids
* @returns {Promise<Map<string, object>>} Lookup map
*/
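
One caveat worth noting: as in the other downloaders, `parseInt(match?.[1])` evaluates to NaN when the expected line is missing from the script output. A defensive variant could look like the sketch below (hypothetical, not part of this commit):

```js
// Hypothetical defensive parsing; the committed code calls parseInt(match?.[1]) directly.
function parseCount(stdout, key) {
  const match = new RegExp(`${key}:\\s*(\\d+)\\s*\\n`, 'i').exec(stdout);
  const value = Number.parseInt(match?.[1] ?? '', 10);
  return Number.isNaN(value) ? undefined : value;
}

// dataset.dataset_cell_count = parseCount(stdout, 'cell_count');
// dataset.dataset_gene_count = parseCount(stdout, 'gene_count');
```
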
6 changes: 6 additions & 0 deletions src/xconsortia/expr_h5ad_adjust.py
@@ -14,6 +14,7 @@ class AnnDataLayer(str, Enum):
SPLICED_UNSPLICED_SUM = "spliced_unspliced_sum"


# https://github.com/hubmapconsortium/ingest-pipeline/blob/master/src/ingest-pipeline/airflow/dags/utils.py#L350
ASSAY_TO_LAYER_MAP = {
"salmon_sn_rnaseq_10x": AnnDataLayer.SPLICED_UNSPLICED_SUM,
"salmon_rnaseq_snareseq": AnnDataLayer.SPLICED_UNSPLICED_SUM,
@@ -25,6 +26,7 @@

def main(args: argparse.Namespace):
"""Replaces the X matrix with a layer depending on assay type and writes the new data to file.
Also prints the number of cells (rows) and genes (columns) to stdout.
Args:
args (argparse.Namespace): CLI arguments, must contain "file", "assay", and "output"
@@ -40,6 +42,10 @@ def main(args: argparse.Namespace):
adata.X = adata.layers[layer]
else:
raise ValueError(f"Layer {layer} not found")

print('cell_count:', len(adata.obs))
print('gene_count:', len(adata.var))

adata.write_h5ad(args.output)


