Skip to content

Commit

Permalink
Merge pull request astropy#3173 from keflavich/cdms_cats
Browse files Browse the repository at this point in the history
CDMS: add whole-molecule query tool & refactor metadata acquisition
  • Loading branch information
bsipocz authored Jan 17, 2025
2 parents ca584f6 + 742289f commit b005fc9
Show file tree
Hide file tree
Showing 11 changed files with 2,904 additions and 1,227 deletions.
11 changes: 11 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ New Tools and Services
Service fixes and enhancements
------------------------------

linelists.cdms
^^^^^^^^^^^^^^

- Add whole catalog retrieval, improve error messaging for unparseable lines,
improve metadata catalog, and improve lookuptable behavior [#3173,#2901]

jplspec
^^^^^^^

- minor improvement to lookuptable behavior [#3173,#2901]


Infrastructure, Utility and Other Changes and Additions
-------------------------------------------------------
Expand Down
17 changes: 10 additions & 7 deletions astroquery/jplspec/lookup_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

class Lookuptable(dict):

def find(self, s, flags):
def find(self, st, flags):
"""
Search dictionary keys for a regex match to string s
Parameters
----------
s : str
st : str
String to compile as a regular expression
Can be entered non-specific for broader results
('H2O' yields 'H2O' but will also yield 'HCCCH2OD')
Expand All @@ -22,17 +22,20 @@ def find(self, s, flags):
Returns
-------
The list of values corresponding to the matches
The dictionary containing only values whose keys match the regex
"""

R = re.compile(s, flags)
if st in self:
return {st: self[st]}

R = re.compile(st, flags)

out = {}

for k, v in self.items():
match = R.search(str(k))
for key, val in self.items():
match = R.search(str(key))
if match:
out[k] = v
out[key] = val

return out
12 changes: 12 additions & 0 deletions astroquery/linelists/cdms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,25 @@ class Conf(_config.ConfigNamespace):
Configuration parameters for `astroquery.linelists.cdms`.
"""
server = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/',
'CDMS Search and Conversion Form URL.')

search = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/cgi-bin/cdmssearch',
'CDMS Search and Conversion Form URL.')

catfile_url = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html',
'CDMS partition function table listing all available molecules.')

catfile_url2 = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html',
'CDMS catalog table listing all available molecules (with different names from partition function).')

classic_server = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic',
'CDMS Classic Molecule List server.')

timeout = _config.ConfigItem(
60,
'Time limit for connecting to the CDMS server.')
Expand Down
222 changes: 208 additions & 14 deletions astroquery/linelists/cdms/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from bs4 import BeautifulSoup
import astropy.units as u
from astropy import table
from astropy.io import ascii
from astroquery.query import BaseQuery
from astroquery.utils import async_to_sync
Expand All @@ -26,8 +27,11 @@ def data_path(filename):
@async_to_sync
class CDMSClass(BaseQuery):
# use the Configuration Items imported from __init__.py
URL = conf.server
URL = conf.search
SERVER = conf.server
CLASSIC_URL = conf.classic_server
TIMEOUT = conf.timeout
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']

def query_lines_async(self, min_frequency, max_frequency, *,
min_strength=-500, molecule='All',
Expand Down Expand Up @@ -143,8 +147,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
else:
payload['Molecules'] = molecule

payload = list(payload.items())

if get_query_payload:
return payload
# BaseQuery classes come with a _request method that includes a
Expand All @@ -170,6 +172,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
response2 = self._request(method='GET', url=fullurl,
timeout=self.TIMEOUT, cache=cache)

# accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
badlist = (self.MALFORMATTED_MOLECULE_LIST + # noqa
[y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
if payload['Molecules'] in badlist:
raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format. "
f"Try get_molecule({payload['Molecules']}) instead.")

return response2

def _parse_result(self, response, *, verbose=False):
Expand Down Expand Up @@ -278,8 +287,9 @@ def _parse_result(self, response, *, verbose=False):

return result

def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
catfile_url=conf.catfile_url):
def get_species_table(self, *, catfile='partfunc.cat', use_cached=True,
catfile_url=conf.catfile_url,
catfile2='catdir.cat', catfile_url2=conf.catfile_url2):
"""
A directory of the catalog is found in a file called 'catdir.cat.'
Expand All @@ -302,9 +312,35 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
"""

if use_cached:
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
try:
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
result2 = ascii.read(data_path(catfile2), format='fixed_width', delimiter='|')
except UnicodeDecodeError:
with open(data_path(catfile), 'rb') as fh:
content = fh.read()
text = content.decode('ascii', errors='replace')
result = ascii.read(text, format='basic', delimiter='|')
with open(data_path(catfile2), 'rb') as fh:
content = fh.read()
text = content.decode('ascii', errors='replace')
result2 = ascii.read(text, format='basic', delimiter='|')
else:
result = retrieve_catfile(catfile_url)
result2 = retrieve_catfile2(catfile_url2)
result.write(data_path(catfile), format='ascii.fixed_width', delimiter='|', overwrite=True)
result2.write(data_path(catfile2), format='ascii.fixed_width', delimiter='|', overwrite=True)

merged = table.join(result, result2, keys=['tag'])
if not all(merged['#lines'] == merged['# lines']):
raise ValueError("Inconsistent table of molecules from CDMS.")
del merged['# lines']

# reorder columns
result = merged[['tag', 'molecule', 'Name', '#lines', 'lg(Q(1000))',
'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))', 'lg(Q(150))', 'lg(Q(75))',
'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
'lg(Q(2.725))',
'Ver.', 'Documentation', 'Date of entry', 'Entry']]

meta = {'lg(Q(1000))': 1000.0,
'lg(Q(500))': 500.0,
Expand All @@ -331,6 +367,96 @@ def tryfloat(x):
result.meta = {'Temperature (K)': [1000., 500., 300., 225., 150., 75.,
37.5, 18.75, 9.375, 5., 2.725]}

result.add_index('tag')

return result

def get_molecule(self, molecule_id, *, cache=True):
"""
Retrieve the whole molecule table for a given molecule id
"""
if not isinstance(molecule_id, str) or len(molecule_id) != 6:
raise ValueError("molecule_id should be a length-6 string of numbers")
url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
response = self._request(method='GET', url=url,
timeout=self.TIMEOUT, cache=cache)
result = self._parse_cat(response)

species_table = self.get_species_table()
result.meta = dict(species_table.loc[int(molecule_id)])

return result

def _parse_cat(self, response, *, verbose=False):
"""
Parse a catalog response into an `~astropy.table.Table`
See details in _parse_response; this is a very similar function,
but the catalog responses have a slightly different format.
"""

if 'Zero lines were found' in response.text:
raise EmptyResponseError(f"Response was empty; message was '{response.text}'.")

text = response.text

# notes about the format
# [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
# 13 21 29 31 41 44 51 55 57 59 61 63 65 67 69 71 73 75 77 79 noqa
starts = {'FREQ': 0,
'ERR': 14,
'LGINT': 22,
'DR': 30,
'ELO': 32,
'GUP': 42,
'TAG': 45,
'QNFMT': 52,
'Q1': 56,
'Q2': 58,
'Q3': 60,
'Q4': 62,
'Q5': 64,
'Q6': 66,
'Q7': 68,
'Q8': 70,
'Q9': 72,
'Q10': 74,
'Q11': 76,
'Q12': 78,
'Q13': 80,
'Q14': 82,
}

result = ascii.read(text, header_start=None, data_start=0,
comment=r'THIS|^\s{12,14}\d{4,6}.*',
names=list(starts.keys()),
col_starts=list(starts.values()),
format='fixed_width', fast_reader=False)

# int truncates - which is what we want
result['MOLWT'] = [int(x/1e4) for x in result['TAG']]

result['FREQ'].unit = u.MHz
result['ERR'].unit = u.MHz

result['Lab'] = result['MOLWT'] < 0
result['MOLWT'] = np.abs(result['MOLWT'])
result['MOLWT'].unit = u.Da

fix_keys = ['GUP']
for suf in '':
for qn in (f'Q{ii}' for ii in range(1, 15)):
qnind = qn+suf
fix_keys.append(qnind)
for key in fix_keys:
if not np.issubdtype(result[key].dtype, np.integer):
intcol = np.array(list(map(parse_letternumber, result[key])),
dtype=int)
result[key] = intcol

result['LGINT'].unit = u.nm**2 * u.MHz
result['ELO'].unit = u.cm**(-1)

return result


Expand Down Expand Up @@ -375,10 +501,13 @@ def find(self, st, flags):
Returns
-------
The list of values corresponding to the matches
The dictionary containing only values whose keys match the regex
"""

if st in self:
return {st: self[st]}

out = {}

for kk, vv in self.items():
Expand All @@ -394,24 +523,89 @@ def find(self, st, flags):
def build_lookup():

result = CDMS.get_species_table()

# start with the 'molecule' column
keys = list(result['molecule'][:]) # convert NAME column to list
values = list(result['tag'][:]) # convert TAG column to list
dictionary = dict(zip(keys, values)) # make k,v dictionary

# repeat with the Name column
keys = list(result['Name'][:])
values = list(result['tag'][:])
dictionary2 = dict(zip(keys, values))
dictionary.update(dictionary2)

lookuptable = Lookuptable(dictionary) # apply the class above

return lookuptable


def retrieve_catfile(url='https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html'):
def retrieve_catfile(url=f'{conf.classic_server}/entries/partition_function.html'):
"""
Simple retrieve index function
"""
response = requests.get(url)
response.raise_for_status()
tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
'lg(Q(2.725))'],
col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
format='fixed_width', delimiter=' ')
lines = response.text.split("\n")

# used to convert '---' to nan
def tryfloat(x):
try:
return float(x)
except ValueError:
return np.nan

# the 'fixed width' table reader fails because there are rows that violate fixed width
tbl_rows = []
for row in lines[15:-5]:
split = row.split()
tag = int(split[0])
molecule_and_lines = row[7:41]
molecule = " ".join(molecule_and_lines.split()[:-1])
nlines = int(molecule_and_lines.split()[-1])
partfunc = map(tryfloat, row[41:].split())
partfunc_dict = dict(zip(['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))'], partfunc))
tbl_rows.append({'tag': tag,
'molecule': molecule,
'#lines': nlines,
})
tbl_rows[-1].update(partfunc_dict)
tbl = table.Table(tbl_rows)
# tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
# names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
# 'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
# 'lg(Q(2.725))'],
# col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
# format='fixed_width', delimiter=' ')
return tbl


def retrieve_catfile2(url=f'{conf.classic_server}/predictions/catalog/catdir.html'):
"""
Simple retrieve index function
"""
response = requests.get(url)
response.raise_for_status()
try:
tbl = ascii.read(response.text, format='html')
except UnicodeDecodeError:
# based on https://github.com/astropy/astropy/issues/3826#issuecomment-256113937
# which suggests to start with the bytecode content and decode with 'replace errors'
text = response.content.decode('ascii', errors='replace')
tbl = ascii.read(text, format='html')

# delete a junk column (wastes space)
del tbl['Catalog']

# for joining - want same capitalization
tbl.rename_column("Tag", "tag")

# one of these is a unicode dash, the other is a normal dash.... in theory
if 'Entry in cm–1' in tbl.colnames:
tbl.rename_column('Entry in cm–1', 'Entry')
if 'Entry in cm-1' in tbl.colnames:
tbl.rename_column('Entry in cm-1', 'Entry')

return tbl
Loading

0 comments on commit b005fc9

Please sign in to comment.