Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove UR:file:// and UR:ftp:// from ref search path, plus REF_PATH to EBI #1881

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 22 additions & 56 deletions cram/cram_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -2473,6 +2473,9 @@ static refs_t *refs_create(void) {
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
BGZF *fp;

if (strncmp(fn, "file://", 7) == 0)
fn += 7;

if (!is_md5 && !hisremote(fn)) {
char fai_file[PATH_MAX];

Expand Down Expand Up @@ -2934,30 +2937,6 @@ static void mkdir_prefix(char *path, int mode) {
*cp = '/';
}

/*
* Return the cache directory to use, based on the first of these
* environment variables to be set to a non-empty value.
*/
static const char *get_cache_basedir(const char **extra) {
char *base;

*extra = "";

base = getenv("XDG_CACHE_HOME");
if (base && *base) return base;

base = getenv("HOME");
if (base && *base) { *extra = "/.cache"; return base; }

base = getenv("TMPDIR");
if (base && *base) return base;

base = getenv("TEMP");
if (base && *base) return base;

return "/tmp";
}

/*
* Queries the M5 string from the header and attempts to populate the
* reference from this using the REF_PATH environment.
Expand All @@ -2971,31 +2950,12 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
sam_hrec_tag_t *tag;
char path[PATH_MAX];
kstring_t path_tmp = KS_INITIALIZE;
char cache[PATH_MAX], cache_root[PATH_MAX];
char *local_cache = getenv("REF_CACHE");
mFILE *mf;
int local_path = 0;

hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);

cache_root[0] = '\0';

if (!ref_path || *ref_path == '\0') {
/*
* If we have no ref path, we use the EBI server.
* However to avoid spamming it we require a local ref cache too.
*/
ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
if (!local_cache || *local_cache == '\0') {
const char *extra;
const char *base = get_cache_basedir(&extra);
snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
local_cache = cache;
hts_log_info("Populating local cache: %s", local_cache);
}
}

if (!r->name)
return -1;

Expand All @@ -3009,7 +2969,10 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {

/* Use cache if available */
if (local_cache && *local_cache) {
if (expand_cache_path(path, local_cache, tag->str+3) == 0)
struct stat sb;
if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
stat(path, &sb) == 0)
// Found it in the local cache
local_path = 1;
}

Expand Down Expand Up @@ -3053,7 +3016,8 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {


/* Otherwise search full REF_PATH; slower as loads entire file */
if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
int is_local = 0;
if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
size_t sz;
r->seq = mfsteal(mf, &sz);
if (r->seq) {
Expand All @@ -3069,15 +3033,23 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
} else {
refs_t *refs;
const char *fn;
sam_hrec_tag_t *UR_tag;

no_M5:
/* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
if (!(tag = sam_hrecs_find_key(ty, "UR", NULL)))
if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
return -1;

if (strstr(UR_tag->str+3, "://") &&
strncmp(UR_tag->str+3, "file:", 5) != 0) {
// Documented as omitted, but accidentally supported until now
hts_log_error("UR tags pointing to remote files are not supported");
return -1;
}

fn = (strncmp(tag->str+3, "file:", 5) == 0)
? tag->str+8
: tag->str+3;
fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
? UR_tag->str+8
: UR_tag->str+3;

if (fd->refs->fp) {
if (bgzf_close(fd->refs->fp) != 0)
Expand Down Expand Up @@ -3108,15 +3080,9 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
}

/* Populate the local disk cache if required */
if (local_cache && *local_cache) {
if (!is_local && local_cache && *local_cache) {
hFILE *fp;

if (*cache_root && !is_directory(cache_root)) {
hts_log_warning("Creating reference cache directory %s\n"
"This may become large; see the samtools(1) manual page REF_CACHE discussion",
cache_root);
}

if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
return 0; // Not fatal - we have the data already so keep going.
}
Expand Down
17 changes: 13 additions & 4 deletions cram/open_trace_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -324,14 +324,21 @@ static mFILE *find_file_dir(const char *file, char *dirname) {
* all of the locations listed in 'path' (which is a colon separated list).
* If 'path' is NULL it uses the RAWDATA environment variable instead.
*
* If non-NULL *local is filled out to 1 for a local file and 0 for a remote
* URL.
*
* Returns a mFILE pointer when found.
* NULL otherwise.
*/
mFILE *open_path_mfile(const char *file, char *path, char *relative_to) {
mFILE *open_path_mfile(const char *file, char *path, char *relative_to,
int *local) {
char *newsearch;
char *ele;
mFILE *fp;

if (local)
*local = 1;

/* Use path first */
if (!path)
path = getenv("RAWDATA");
Expand Down Expand Up @@ -361,14 +368,16 @@ mFILE *open_path_mfile(const char *file, char *path, char *relative_to) {

if (0 == strncmp(ele2, "URL=", 4)) {
if ((fp = find_file_url(file, ele2+4))) {
if (local)
*local = strncmp(ele2+4, "file:", 5) == 0 ? 1 : 0;
free(newsearch);
return fp;
}
} else if (!strncmp(ele2, "http:", 5) ||
!strncmp(ele2, "https:", 6) ||
!strncmp(ele2, "ftp:", 4)) {
} else if (hisremote(ele2)) {
if ((fp = find_file_url(file, ele2))) {
free(newsearch);
if (local)
*local = 0;
return fp;
}
} else if ((fp = find_file_dir(file, ele2))) {
Expand Down
6 changes: 5 additions & 1 deletion cram/open_trace_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,14 @@ char *tokenise_search_path(const char *searchpath);
* all of the locations listed in 'path' (which is a colon separated list).
* If 'path' is NULL it uses the RAWDATA environment variable instead.
*
* If non-NULL *local is filled out to 1 for a local file and 0 for a remote
* URL.
*
* Returns a mFILE pointer when found.
* NULL otherwise.
*/
mFILE *open_path_mfile(const char *file, char *path, char *relative_to);
mFILE *open_path_mfile(const char *file, char *path, char *relative_to,
int *local);

/*
* Returns a mFILE containing the entire contents of the url;
Expand Down