Draft: Introduce REDUCTION and ERRATA file sections #473

Closed
29 changes: 27 additions & 2 deletions extra_data/file_access.py
@@ -146,6 +146,8 @@ def __init__(self, filename, _cache_info=None):
self.train_ids = _cache_info['train_ids']
self.control_sources = _cache_info['control_sources']
self.instrument_sources = _cache_info['instrument_sources']
self.reduction_data = _cache_info['reduction_data']
Contributor

This should either not be cached at all or use conditional access like validity_flag further below. Since it's only auxiliary information and may often be comparatively small, it may be fine to always generate it on demand. If this changes in the future, we can add machinery to augment existing indices, or simply re-create them.
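
A minimal sketch of the conditional-access variant (my illustration only, mirroring how validity_flag falls back via _cache_info.get('flag', None); the key names here are assumptions):

```python
# Sketch only: read the optional sections from a cache entry, tolerating older
# cache files that do not contain the new keys.
def optional_sections(cache_info):
    return (frozenset(cache_info.get('reduction_data', ())),
            frozenset(cache_info.get('errata', ())))
```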

Member Author

Makes sense. I'll remove it from caching.

self.errata = _cache_info['errata']
self.validity_flag = _cache_info.get('flag', None)
else:
try:
@@ -155,7 +157,8 @@ def __init__(self, filename, _cache_info=None):

self.train_ids = tid_data[tid_data != 0]

self.control_sources, self.instrument_sources = self._read_data_sources()
(self.control_sources, self.instrument_sources,
self.reduction_data, self.errata) = self._read_data_sources()

self.validity_flag = None

@@ -295,6 +298,7 @@ def format_version(self):

def _read_data_sources(self):
control_sources, instrument_sources = set(), set()
reduction_info, errata = set(), set()

# The list of data sources moved in file format 1.0
if self.format_version == '0.5':
@@ -320,14 +324,19 @@ def _read_data_sources(self):
# TODO: Do something with groups?
elif category == 'CONTROL':
control_sources.add(h5_source)
elif category == 'REDUCTION':
reduction_info.add(h5_source)
elif category == 'ERRATA':
errata.add(h5_source)
elif category == 'Karabo_TimerServer':
# Ignore virtual data source used only in file format
# version 1.1 / pclayer-1.10.3-2.10.5.
pass
else:
raise ValueError("Unknown data category %r" % category)
Contributor
@philsmt, Dec 19, 2023

So this bit is probably the big sore point for this proposal - pretty much all prior versions of EXtra-data (or even karabo_data for that matter) will break with these new files.

Since this only affects the METADATA/dataSources entry but not the actual existence of the root groups, one way around this would be to not list these sources here and either:

  1. Not list them at all and instead discover them by iterating over the root group (roughly sketched below the list)
  2. List them in a distinct dataset only for these virtual metadata sources
  3. List both the real and virtual sources in a new dataset while preserving the old one for compatibility
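
A rough sketch of what option 1 could look like (my illustration only; it assumes h5py, the root-group layout of the example file, and pipeline-style source names containing ':'):

```python
import h5py

# Sketch of option 1: discover REDUCTION/ERRATA sources by walking the root
# groups directly, leaving METADATA/dataSources untouched for old readers.
# Heuristic: treat any group whose last path component contains ':' as a
# source name, matching the example file
# (e.g. PULSE_REDUCTION/PCLAYER_CI/DAQ/DET_DATA_TEST_1:xtdf).
def discover_extra_sources(path):
    found = {'REDUCTION': set(), 'ERRATA': set()}
    with h5py.File(path, 'r') as f:
        for section, sources in found.items():
            if section not in f:
                continue

            def collect(name, obj, sources=sources):
                if isinstance(obj, h5py.Group) and ':' in name.rsplit('/', 1)[-1]:
                    sources.add(name)

            f[section].visititems(collect)
    return found
```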

Member Author

Yes, I don't see any way around it though, except if we add these sections below METADATA after all. Generally, this check should possibly be made less restrictive in my view, as it could hit us again in the future.
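
For instance, the category handling could hypothetically be relaxed along these lines (my own sketch, not part of this PR; the helper name is made up):

```python
import warnings

# Sketch: skip unknown categories with a warning rather than raising
# ValueError, so a future file section does not break older readers outright.
def classify_source(category, h5_source, sections):
    if category in sections:
        sections[category].add(h5_source)
    elif category == 'Karabo_TimerServer':
        pass  # virtual source used only in file format version 1.1
    else:
        warnings.warn(f"Ignoring unknown data category {category!r}")
```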


return frozenset(control_sources), frozenset(instrument_sources)
return (frozenset(control_sources), frozenset(instrument_sources),
frozenset(reduction_info), frozenset(errata))

def _guess_valid_trains(self):
# File format version 1.0 includes a flag which is 0 if a train ID
@@ -418,6 +427,10 @@ def index_groups(self, source):
return {''}
elif source in self.instrument_sources:
return set(self.file[f'/INDEX/{source}'].keys())
elif source in self.reduction_data:
return set(self.file[f'/INDEX/{source}'].keys())
elif source in self.errata:
Contributor

I understand from our example that REDUCTION sources are always prefixed with some category (e.g. REDUCTION/PULSE_SLICING/<XTDF-source>/...). What would ERRATA sources look like? All the reader machinery assumes that source names are unique across all root sections.

Member Author

There's nothing concrete implemented in the DAQ, but some form of source identification is required. Think of a train that came outside the buffer range and couldn't be stored. You'd want to know what the data was, so it would be prefixed with the source name in the path.

Also, technically all this additional data is FAST data for the DAQ - i.e. it will be stored only if a train contains it.

Contributor

Certainly, my question was more concretely about how you would pick the source name. Say SA3_XTD10_XGM/XGM/DOOCS:output has a hiccup and sends a late train; under what source name (i.e. the path below ERRATA) would you save this? It must be different from SA3_XTD10_XGM/XGM/DOOCS:output by at least a prefix or suffix.

Furthermore, I had a look at your example file for data reduction. There are INDEX entries for

  • PCLAYER_CI/DAQ/DET_DATA_TEST_1:xtdf
  • PCLAYER_CI/LFF/DATA_TEST_1:reductionOutput
  • PULSE_REDUCTION/PCLAYER_CI/DAQ/DET_DATA_TEST_1:xtdf

with a corresponding entry in INSTRUMENT for the first and in REDUCTION for the latter two. Is PCLAYER_CI/LFF/DATA_TEST_1 here a real device used as input for the data aggregator, or also some virtual source used for metadata tracking? Ideally, sources in REDUCTION or ERRATA would never be identical to actual sources potentially occurring in CONTROL or INSTRUMENT.

Member Author

So in the reduction case, I prefix all source names with PULSE_REDUCTION to make them distinct from the "real" sources; one could do the same for ERRATA, e.g. ERRATA/LATE_TRAINS/source/name/and/data/path/ (a small illustration follows the list below).

Hence, for what you observe in the index section:

  • PCLAYER_CI/DAQ/DET_DATA_TEST_1:xtdf is the real source
  • PCLAYER_CI/LFF/DATA_TEST_1:reductionOutput is the real source
  • PULSE_REDUCTION/PCLAYER_CI/DAQ/DET_DATA_TEST_1:xtdf is injected by the pulse dim reducer, i.e. it is an artificial source.
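
As a purely hypothetical illustration of that naming scheme (the prefixes and the helper are my own examples, not a fixed convention):

```python
# Hypothetical helper: map a prefixed REDUCTION/ERRATA source name back to the
# underlying "real" source, assuming prefixes in the style discussed above.
KNOWN_PREFIXES = ('PULSE_REDUCTION/', 'LATE_TRAINS/')

def original_source(prefixed_name):
    for prefix in KNOWN_PREFIXES:
        if prefixed_name.startswith(prefix):
            return prefixed_name[len(prefix):]
    return prefixed_name

assert original_source(
    'PULSE_REDUCTION/PCLAYER_CI/DAQ/DET_DATA_TEST_1:xtdf'
) == 'PCLAYER_CI/DAQ/DET_DATA_TEST_1:xtdf'
```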

return set(self.file[f'/INDEX/{source}'].keys())
else:
raise SourceNameError(source)

@@ -454,6 +467,10 @@ def get_keys(self, source):
group = '/CONTROL/' + source
elif source in self.instrument_sources:
group = '/INSTRUMENT/' + source
elif source in self.reduction_data:
group = '/REDUCTION/' + source
elif source in self.errata:
group = '/ERRATA/' + source
else:
raise SourceNameError(source)

@@ -478,6 +495,10 @@ def get_one_key(self, source):
group = '/CONTROL/' + source
elif source in self.instrument_sources:
group = '/INSTRUMENT/' + source
elif source in self.reduction_data:
group = '/REDUCTION/' + source
elif source in self.errata:
group = '/ERRATA/' + source
else:
raise SourceNameError(source)

@@ -527,6 +548,10 @@ def has_source_key(self, source, key):
path = '/CONTROL/{}/{}'.format(source, key.replace('.', '/'))
elif source in self.instrument_sources:
path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/'))
elif source in self.reduction_data:
path = '/REDUCTION/{}/{}'.format(source, key.replace('.', '/'))
elif source in self.errata:
path = '/ERRATA/{}/{}'.format(source, key.replace('.', '/'))
else:
raise SourceNameError(source)

49 changes: 31 additions & 18 deletions extra_data/reader.py
@@ -101,6 +101,10 @@ def __init__(
files_by_sources[source, 'CONTROL'].append(f)
for source in f.instrument_sources:
files_by_sources[source, 'INSTRUMENT'].append(f)
for source in f.reduction_data:
files_by_sources[source, 'REDUCTION'].append(f)
for source in f.errata:
files_by_sources[source, 'ERRATA'].append(f)
sources_data = {
src: SourceData(src,
sel_keys=None,
@@ -127,6 +131,14 @@ def __init__(
name for (name, sd) in self._sources_data.items()
if sd.section == 'INSTRUMENT'
})
self.reduction_data = frozenset({
name for (name, sd) in self._sources_data.items()
if sd.section == 'REDUCTION'
})
self.errata = frozenset({
name for (name, sd) in self._sources_data.items()
if sd.section == 'ERRATA'
})

@staticmethod
def _open_file(path, cache_info=None):
@@ -401,26 +413,27 @@ def train_from_id(
path = '/CONTROL/{}/{}'.format(source, key.replace('.', '/'))
source_data[key] = file.file[path][first]

for source in self.instrument_sources:
source_data = res[source] = {
'metadata': {'source': source, 'timestamp.tid': train_id}
}
file, pos = self._find_data(source, train_id)
if file is None:
continue

for key in self.keys_for_source(source):
group = key.partition('.')[0]
firsts, counts = file.get_index(source, group)
first, count = firsts[pos], counts[pos]
if not count:
for prefix, category in (("INSTRUMENT", self.instrument_sources), ("REDUCTION", self.reduction_data), ("ERRATA", self.errata)):
for source in category:
source_data = res[source] = {
'metadata': {'source': source, 'timestamp.tid': train_id}
}
file, pos = self._find_data(source, train_id)
if file is None:
continue

path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/'))
if count == 1 and not keep_dims:
source_data[key] = file.file[path][first]
else:
source_data[key] = file.file[path][first : first + count]
for key in self.keys_for_source(source):
group = key.partition('.')[0]
firsts, counts = file.get_index(source, group)
first, count = firsts[pos], counts[pos]
if not count:
continue

path = '/{}/{}/{}'.format(prefix, source, key.replace('.', '/'))
if count == 1 and not keep_dims:
source_data[key] = file.file[path][first]
else:
source_data[key] = file.file[path][first : first + count]

if flat_keys:
# {src: {key: data}} -> {(src, key): data}
6 changes: 5 additions & 1 deletion extra_data/run_files_map.py
@@ -132,7 +132,9 @@ def get(self, path):
res = {
'train_ids': np.array(d['train_ids'], dtype=np.uint64),
'control_sources': frozenset(d['control_sources']),
'instrument_sources': frozenset(d['instrument_sources'])
'instrument_sources': frozenset(d['instrument_sources']),
'reduction_data': frozenset(d.get('reduction_data', set())),
'errata': frozenset(d.get('errata', set()))
}
# Older cache files don't contain info on 'suspect' trains.
if 'suspect_train_indices' in d:
@@ -182,6 +184,8 @@ def save(self, files):
'train_ids': [int(t) for t in file_access.train_ids],
'control_sources': sorted(file_access.control_sources),
'instrument_sources': sorted(file_access.instrument_sources),
'reduction_data': sorted(file_access.reduction_data),
'errata': sorted(file_access.errata),
'suspect_train_indices': [
int(i) for i in (~file_access.validity_flag).nonzero()[0]
],