Skip to content

Commit

Permalink
🚧 debugging notes
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Jan 30, 2025
1 parent 9d7bee0 commit 4ada7e3
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 8 deletions.
30 changes: 22 additions & 8 deletions augur/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,10 +472,15 @@ def merge_sequences(args):
print_info(f"Reading sequences from {s!r}…")

# run instead of Popen to ensure that file contents are read in series, not in parallel.
# FIXME: this hangs in two different ways. why?
#
# 1. sometime after copyfileobj (cutoff is probably 256 * 1024 bytes, one less seq and it hangs later):
# for i in $(seq -w 1 26214); do echo -e ">s$(printf "%05d" $i)\nA"; done > sequences.fasta; augur merge --sequences sequences.fasta --output-sequences merged.fasta
#
# 2. during copyfileobj (cutoff is probably 272 * 1024 bytes; one less seq and it hangs later):
# for i in $(seq -w 1 27853); do echo -e ">s$(printf "%05d" $i)\nA"; done > sequences.fasta; augur merge --sequences sequences.fasta --output-sequences merged.fasta
read_processes.append(subprocess.run([*augur, "read-file", s], stdout=seqkit_process.stdin))

# FIXME: this hangs on a malformed XZ file. could be an issue with augur read-file, need to investigate

# Add a newline character to support FASTA files that are missing one at the end.
# This is fine because extraneous newline characters are stripped by seqkit.
print(file=seqkit_process.stdin)
Expand All @@ -489,28 +494,37 @@ def merge_sequences(args):

print_info(f"Merging sequences and writing to {args.output_sequences!r}…")

# FIXME: this hangs after input size of 64 * 1024 bytes. why?
# works:
# for i in $(seq -w 1 6553); do echo -e ">s$(printf "%05d" $i)\nA"; done > sequences.fasta; augur merge --sequences sequences.fasta --output-sequences merged.fasta
# hangs:
# for i in $(seq -w 1 6554); do echo -e ">s$(printf "%05d" $i)\nA"; done > sequences.fasta; augur merge --sequences sequences.fasta --output-sequences merged.fasta
for line in iter(seqkit_process.stderr.readline, ''):
if line.startswith("[INFO]"):
print_debug(f"[SeqKit] {line.rstrip()}")
else:
print_info(f"[SeqKit] {line.rstrip()}")

# Write merged sequences.
subprocess.run([*augur, "write-file", args.output_sequences], stdin=seqkit_process.stdout)
write_process = subprocess.run([*augur, "write-file", args.output_sequences], stdin=seqkit_process.stdout)

# Wait for the seqkit process to finish.
# FIXME: address potential deadlock issue?
# <https://docs.python.org/3/library/subprocess.html#subprocess.Popen.wait>
# Options:
# 1. iterate on seqkit_process.stdout.readline manually instead of passing above as stdin=seqkit_process.stdout?
# 2. use Popen.communicate() - a nonstarter because the data read is buffered in memory.
# 3. use asyncio - a nonstarter since it requires refactoring code beyond augur merge.
# <https://docs.python.org/3/library/subprocess.html#subprocess.Popen.communicate>
returncode = seqkit_process.wait()
# 3. use asyncio - a nonstarter since it requires refactoring code beyond augur merge.
seqkit_process.wait()

if returncode != 0 or any(process.returncode == 0 for process in read_processes):
if (any(process.returncode != 0 for process in read_processes) or
seqkit_process.returncode != 0 or
write_process.returncode != 0):
# Ideally, read-file, seqkit, and write-file errors would all be
# handled separately. However, due to the concurrent piping, there
# is no way to distinguish errors from the three commands.
# handled separately. However, due to the concurrent piping, the return
# codes are not always reflective of the process itself. For example, a
# seqkit error causes a read process's return code to be non-zero.
# FIXME: make this better?
if args.output_sequences == "-":
raise AugurError(f"Merging failed, see error(s) above. You are responsible for cleanup of any partial outputs.")
Expand Down
2 changes: 2 additions & 0 deletions augur/read_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ def run(args):
# and like most Unix programs. See also
# <https://docs.python.org/3/library/signal.html#note-on-sigpipe>.
try:
print("running copyfileobj...", file=sys.stderr)
copyfileobj(f, sys.stdout, BUFFER_SIZE)
print("finished running copyfileobj.", file=sys.stderr)

# Force a flush so if SIGPIPE is going to happen it happens now.
sys.stdout.flush()
Expand Down

0 comments on commit 4ada7e3

Please sign in to comment.