Skip to content

Commit

Permalink
Merge pull request #174 from hubmapconsortium/mruffalo/expose-manifes…
Browse files Browse the repository at this point in the history
…t-qa-qc

Expose manifest `is_qa_qc` field to Elasticsearch
  • Loading branch information
jswelling authored Oct 20, 2020
2 parents 620e7c3 + 21eb367 commit 16078bf
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 11 deletions.
28 changes: 17 additions & 11 deletions src/ingest-pipeline/airflow/dags/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,12 @@
COMPILED_WORKFLOW_MAP: Optional[List[Tuple[Pattern, Pattern, str]]] = None


# Result of matching one file against the manifest patterns:
# (add to index?, description, EDAM ontology term, is_qa_qc flag)
ManifestMatch = Tuple[bool, Optional[str], Optional[str], Optional[bool]]


class FileMatcher(ABC):
@abstractmethod
def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
def get_file_metadata(self, file_path: Path) -> ManifestMatch:
"""
:return: A 4-tuple (`ManifestMatch`; the final element is the `is_qa_qc` flag or `None`):
[0] bool, whether to add `file_path` to a downstream index
Expand All @@ -113,20 +116,21 @@ def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optio

class PipelineFileMatcher(FileMatcher):
# (file/directory regex, description template, EDAM ontology term, is_qa_qc flag)
matchers: List[Tuple[Pattern, str, str]]
matchers: List[Tuple[Pattern, str, str, bool]]

def __init__(self):
self.matchers = []

@classmethod
def read_manifest(cls, pipeline_file_manifest: Path) -> Iterable[Tuple[Pattern, str, str]]:
def read_manifest(cls, pipeline_file_manifest: Path) -> Iterable[Tuple[Pattern, str, str, bool]]:
# Loads one JSON pipeline file manifest, checks it with
# localized_assert_json_matches_schema against 'pipeline_file_manifest.yml',
# and yields one matcher tuple per manifest entry:
# (compiled pattern, description template, EDAM ontology term, is_qa_qc).
with open(pipeline_file_manifest) as f:
manifest = json.load(f)
localized_assert_json_matches_schema(manifest, 'pipeline_file_manifest.yml')

for annotation in manifest:
pattern = re.compile(annotation['pattern'])
yield pattern, annotation['description'], annotation['edam_ontology_term']
# 'is_qa_qc' is read with .get(), so entries that omit it default to False
is_qa_qc = annotation.get('is_qa_qc', False)
yield pattern, annotation['description'], annotation['edam_ontology_term'], is_qa_qc

@classmethod
def create_from_files(cls, pipeline_file_manifests: Iterable[Path]):
Expand All @@ -135,30 +139,30 @@ def create_from_files(cls, pipeline_file_manifests: Iterable[Path]):
obj.matchers.extend(cls.read_manifest(manifest))
return obj

def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
def get_file_metadata(self, file_path: Path) -> ManifestMatch:
"""
Checks `file_path` against the list of patterns stored in this object.
At the first match, return `True` plus the associated formatted
description, ontology term, and `is_qa_qc` flag. If no match, return
`(False, None, None, None)`. Patterns are ordered in the JSON file, so
the "first-match" behavior is deliberate.
"""
path_str = fspath(file_path)
for pattern, description_template, ontology_term in self.matchers:
for pattern, description_template, ontology_term, is_qa_qc in self.matchers:
# TODO: walrus operator
m = pattern.search(path_str)
if m:
# Named groups captured by the pattern fill in the description template
formatted_description = description_template.format_map(m.groupdict())
return True, formatted_description, ontology_term
return False, None, None
return True, formatted_description, ontology_term, is_qa_qc
return False, None, None, None


class DummyFileMatcher(FileMatcher):
"""
Drop-in replacement for PipelineFileMatcher which allows everything and always
provides empty descriptions and ontology terms, with `is_qa_qc` set to `False`.
"""
def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
return True, '', ''
def get_file_metadata(self, file_path: Path) -> ManifestMatch:
# `file_path` is not consulted: every file gets the same fixed result
return True, '', '', False


def find_pipeline_manifests(cwl_files: Iterable[Path]) -> List[Path]:
Expand Down Expand Up @@ -376,6 +380,7 @@ def get_file_metadata(root_dir: str, matcher: FileMatcher) -> List[Mapping[str,
'size': <file size>,
'description': <human-readable file description>,
'edam_term': <EDAM ontology term>,
'is_qa_qc': <Boolean of whether this is a QA/QC file>,
},
...
]
Expand All @@ -389,7 +394,7 @@ def get_file_metadata(root_dir: str, matcher: FileMatcher) -> List[Mapping[str,
for fn in fnames:
full_path = dp / fn
relative_path = full_path.relative_to(root_path)
add_to_index, description, ontology_term = matcher.get_file_metadata(relative_path)
add_to_index, description, ontology_term, is_qa_qc = matcher.get_file_metadata(relative_path)
if add_to_index:
# sha1sum disabled because of run time issues on large data collections
#line = check_output([word.format(fname=full_path)
Expand All @@ -402,6 +407,7 @@ def get_file_metadata(root_dir: str, matcher: FileMatcher) -> List[Mapping[str,
'size': getsize(full_path),
'description': description,
'edam_term': ontology_term,
'is_qa_qc': is_qa_qc,
#'sha1sum': cs,
}
)
Expand Down
3 changes: 3 additions & 0 deletions src/ingest-pipeline/schemata/file_info_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
},
"edam_term": {
"type": "string"
},
"is_qa_qc": {
"type": "boolean"
}
}
},
Expand Down
1 change: 1 addition & 0 deletions src/ingest-pipeline/schemata/file_info_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
'size': {'type': 'integer', 'minimum': 0}
'description': {'type': 'string'}
'edam_term': {'type': 'string'}
'is_qa_qc': { 'type': 'boolean' }
#'sha1sum': {'type': 'string', 'pattern': '^[a-fA-F0-9]{40}$'}
'file_info':
'type': 'array'
Expand Down
4 changes: 4 additions & 0 deletions src/ingest-pipeline/schemata/pipeline_file_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
"edam_ontology_term": {
"type": "string",
"description": "Term in the EDAM ontology describing this pipeline output file"
},
"is_qa_qc": {
"type": "boolean",
"description": "Whether this file is a QA/QC report"
}
},
"required": ["pattern", "description", "edam_ontology_term"]
Expand Down
3 changes: 3 additions & 0 deletions src/ingest-pipeline/schemata/pipeline_file_manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,7 @@
'edam_ontology_term':
'type': 'string'
'description': 'Term in the EDAM ontology describing this pipeline output file'
'is_qa_qc':
'type': 'boolean'
'description': 'Whether this file is a QA/QC report'
'required': ['pattern', 'description', 'edam_ontology_term']

0 comments on commit 16078bf

Please sign in to comment.