Skip to content

Commit

Permalink
Update some methods in BamFileIoUtils accept Path input (#1681)
Browse files Browse the repository at this point in the history
  • Loading branch information
takutosato authored Oct 11, 2023
1 parent c6b1fe1 commit 6d1a796
Show file tree
Hide file tree
Showing 12 changed files with 261 additions and 44 deletions.
101 changes: 64 additions & 37 deletions src/main/java/htsjdk/samtools/BamFileIoUtils.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package htsjdk.samtools;

import htsjdk.beta.exception.HtsjdkException;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
Expand All @@ -10,13 +12,15 @@
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.Md5CalculatingOutputStream;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.utils.ValidationUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

public class BamFileIoUtils {
Expand All @@ -32,10 +36,18 @@ public static boolean isBamFile(final File file) {
return ((file != null) && SamReader.Type.BAM_TYPE.hasValidFileExtension(file.getName()));
}

public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile) {
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path inputFile, final Path outputFile) {
reheaderBamFile(samFileHeader, inputFile, outputFile, true, true);
}


/**
* Support File input types for backward compatibility. Use the same method with Path inputs below.
*/
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile, final boolean createMd5, final boolean createIndex) {
reheaderBamFile(samFileHeader, IOUtil.toPath(inputFile), IOUtil.toPath(outputFile), createMd5, createIndex);
}

/**
* Copy a BAM file but replacing the header
*
Expand All @@ -45,12 +57,14 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File
* @param createMd5 Whether or not to create an MD5 file for the new BAM
* @param createIndex Whether or not to create an index file for the new BAM
*/
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile, final boolean createMd5, final boolean createIndex) {
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path inputFile, final Path outputFile, final boolean createMd5, final boolean createIndex) {
ValidationUtils.nonNull(inputFile);
ValidationUtils.nonNull(outputFile);
IOUtil.assertFileIsReadable(inputFile);
IOUtil.assertFileIsWritable(outputFile);
IOUtil.assertFileIsWritable(inputFile);

try {
BlockCompressedInputStream.assertNonDefectiveFile(inputFile);
BlockCompressedInputStream.assertNonDefectivePath(inputFile);
assertSortOrdersAreEqual(samFileHeader, inputFile);

final OutputStream outputStream = buildOutputStream(outputFile, createMd5, createIndex);
Expand All @@ -65,58 +79,61 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File
}
}

public static void blockCopyBamFile(final File inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
blockCopyBamFile(IOUtil.toPath(inputFile), outputStream, skipHeader, skipTerminator);
}

/**
* Copy data from a BAM file to an OutputStream by directly copying the gzip blocks
* Copy data from a BAM file to an OutputStream by directly copying the gzip blocks.
*
* @param inputFile The file to be copied
* @param inputFile The BAM file to be copied
* @param outputStream The stream to write the copied data to
* @param skipHeader If true, the header of the input file will not be copied to the output stream
* @param skipTerminator If true, the terminator block of the input file will not be written to the output stream
*/
public static void blockCopyBamFile(final File inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
FileInputStream in = null;
try {
in = new FileInputStream(inputFile);

public static void blockCopyBamFile(final Path inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
try (final SeekablePathStream in = new SeekablePathStream(inputFile)){
// a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it if skipTerminator is true
final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(inputFile);
if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE)
throw new SAMException(inputFile.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
throw new SAMException(inputFile.toUri() + " does not have a valid GZIP block at the end of the file.");

if (skipHeader) {
final long vOffsetOfFirstRecord = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile);
final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(inputFile);
blockIn.seek(vOffsetOfFirstRecord);
final long remainingInBlock = blockIn.available();

// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop
if (remainingInBlock >= 0) {
final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(outputStream, (Path)null);
IOUtil.transferByStream(blockIn, blockOut, remainingInBlock);
blockOut.flush();
// Don't close blockOut because closing underlying stream would break everything
}

long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer());
blockIn.close();
while (pos > 0) {
pos -= in.skip(pos);
// tsato: curious --- why do we need BlockCompressedInputStream at all here?
try (final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(inputFile)) {
blockIn.seek(vOffsetOfFirstRecord);
final long remainingInBlock = blockIn.available();

// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop (tsato: update this comment)
if (remainingInBlock >= 0) {
final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(outputStream, (Path) null);
IOUtil.transferByStream(blockIn, blockOut, remainingInBlock);
blockOut.flush();
// Don't close blockOut because closing underlying stream would break everything
}

final long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer());
blockIn.close(); // tsato: why doesn't IntelliJ say this is unnecessary?

in.seek(pos);
} catch (IOException e){
throw new HtsjdkException("Encountered an error.", e);
}
}

// Copy remainder of input stream into output stream
final long currentPos = in.getChannel().position();
final long length = inputFile.length();
final long currentPos = in.position();
final long length = Files.size(inputFile);
final long skipLast = ((term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) && skipTerminator) ?
BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
final long bytesToWrite = length - skipLast - currentPos;

IOUtil.transferByStream(in, outputStream, bytesToWrite);
} catch (final IOException ioe) {
throw new RuntimeIOException(ioe);
} finally {
CloserUtil.close(in);
}
}

Expand All @@ -140,7 +157,7 @@ public static void gatherWithBlockCopying(final List<File> bams, final File outp

for (final File f : bams) {
LOG.info(String.format("Block copying %s ...", f.getAbsolutePath()));
blockCopyBamFile(f, out, !isFirstFile, true);
blockCopyBamFile(IOUtil.toPath(f), out, !isFirstFile, true);
isFirstFile = false;
}

Expand All @@ -162,17 +179,27 @@ public static void gatherWithBlockCopying(final List<File> bams, final File outp
}

private static OutputStream buildOutputStream(final File outputFile, final boolean createMd5, final boolean createIndex) throws IOException {
OutputStream outputStream = new FileOutputStream(outputFile);
return buildOutputStream(IOUtil.toPath(outputFile), createMd5, createIndex);
}

private static OutputStream buildOutputStream(final Path outputFile, final boolean createMd5, final boolean createIndex) throws IOException {
OutputStream outputStream = Files.newOutputStream(outputFile);
if (createMd5) {
outputStream = new Md5CalculatingOutputStream(outputStream, new File(outputFile.getAbsolutePath() + ".md5"));
outputStream = new Md5CalculatingOutputStream(outputStream, IOUtil.addExtension(outputFile, FileExtensions.MD5));
}
if (createIndex) {
outputStream = new StreamInflatingIndexingOutputStream(outputStream, new File(outputFile.getParentFile(), IOUtil.basename(outputFile) + FileExtensions.BAI_INDEX));
outputStream = new StreamInflatingIndexingOutputStream(outputStream, outputFile.resolveSibling(outputFile.getFileName() + FileExtensions.BAI_INDEX));
}
return outputStream;
}


@Deprecated
private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final File inputFile) throws IOException {
assertSortOrdersAreEqual(newHeader, IOUtil.toPath(inputFile));
}

private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final Path inputFile) throws IOException {
final SamReader reader = SamReaderFactory.makeDefault().open(inputFile);
final SAMFileHeader origHeader = reader.getFileHeader();
final SAMFileHeader.SortOrder newSortOrder = newHeader.getSortOrder();
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/samtools/SAMRecordIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
/**
* A general interface that adds functionality to a CloseableIterator of
* SAMRecords. Currently, this interface is implemented by iterators that
* want to validate as they are iterating that that the records in the
* want to validate as they are iterating that the records in the
* underlying SAM/BAM file are in a particular order.
*/
public interface SAMRecordIterator extends CloseableIterator<SAMRecord> {
Expand Down
11 changes: 11 additions & 0 deletions src/main/java/htsjdk/samtools/SAMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
*/
package htsjdk.samtools;

import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.CigarUtil;
Expand All @@ -36,6 +37,7 @@
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
Expand Down Expand Up @@ -677,6 +679,15 @@ public static int combineMapqs(int m1, int m2) {

}


public static long findVirtualOffsetOfFirstRecordInBam(final Path bamFile) {
try (SeekableStream ss = new SeekablePathStream(bamFile)){
return BAMFileReader.findVirtualOffsetOfFirstRecord(ss);
} catch (final IOException ioe) {
throw new RuntimeEOFException(ioe);
}
}

/**
* Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file
* offset after skipping over the text header and the sequence records.
Expand Down
12 changes: 8 additions & 4 deletions src/main/java/htsjdk/samtools/SamFileValidator.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.nio.file.Path;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
Expand Down Expand Up @@ -209,20 +210,23 @@ public boolean validateSamFileVerbose(final SamReader samReader, final Reference
}

public void validateBamFileTermination(final File inputFile) {
validateBamFileTermination(IOUtil.toPath(inputFile));
}

public void validateBamFileTermination(final Path inputFile) {
try {
if (!IOUtil.isBlockCompressed(inputFile.toPath())) {
if (!IOUtil.isBlockCompressed(inputFile)) {
return;
}
final BlockCompressedInputStream.FileTermination terminationState =
BlockCompressedInputStream.checkTermination(inputFile);
if (terminationState.equals(BlockCompressedInputStream.FileTermination.DEFECTIVE)) {
addError(new SAMValidationError(Type.TRUNCATED_FILE, "BAM file has defective last gzip block",
inputFile.getPath()));
inputFile.toUri().toString()));
} else if (terminationState.equals(BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK)) {
addError(new SAMValidationError(Type.BAM_FILE_MISSING_TERMINATOR_BLOCK,
"Older BAM file -- does not have terminator block",
inputFile.getPath()));

inputFile.toUri().toString()));
}
} catch (IOException e) {
throw new SAMException("IOException", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import htsjdk.samtools.seekablestream.SeekableBufferedStream;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableHTTPStream;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.zip.InflaterFactory;

Expand Down Expand Up @@ -123,6 +124,15 @@ public BlockCompressedInputStream(final File file) throws IOException {
this(file, BlockGunzipper.getDefaultInflaterFactory());
}


/**
* Equivalent constructor for Path as the one that takes a File. Supports seeking.
*/
public BlockCompressedInputStream(final Path file) throws IOException {
this(new SeekablePathStream(file));
}


/**
* Use this ctor if you wish to call seek()
* @param file source of bytes
Expand All @@ -135,6 +145,7 @@ public BlockCompressedInputStream(final File file, final InflaterFactory inflate
blockGunzipper = new BlockGunzipper(inflaterFactory);
}


/**
* @param url source of bytes
*/
Expand Down Expand Up @@ -704,9 +715,14 @@ static void readFully(SeekableByteChannel channel, ByteBuffer dst) throws IOExce
}
}

@Deprecated
public static void assertNonDefectiveFile(final File file) throws IOException {
assertNonDefectivePath(IOUtil.toPath(file));
}

public static void assertNonDefectivePath(final Path file) throws IOException {
if (checkTermination(file) == FileTermination.DEFECTIVE) {
throw new SAMException(file.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
throw new SAMException(file.toUri() + " does not have a valid GZIP block at the end of the file.");
}
}

Expand Down
1 change: 1 addition & 0 deletions src/main/java/htsjdk/samtools/util/FileExtensions.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ public final class FileExtensions {
public static final String GZI = ".gzi";
public static final String SBI = ".sbi";
public static final String CSI = ".csi";
public static final String MD5 = ".md5";

public static final Set<String> BLOCK_COMPRESSED = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(".gz", ".gzip", ".bgz", ".bgzf")));

Expand Down
15 changes: 14 additions & 1 deletion src/main/java/htsjdk/samtools/util/IOUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,18 @@ public static void assertFilesAreWritable(final List<File> files) {
for (final File file : files) assertFileIsWritable(file);
}


/**
* In some filesystems (e.g. google cloud) it may not make sense to check writability.
* This method only checks writability when it's (i.e. for now when the path points to a file
* in the local filesystem)
*/
public static void assertFileIsWritable(final Path path){ // tsato: perhaps the input type should be IOPath
if (path.toUri().getScheme().equals("file")){
IOUtil.assertFileIsWritable(path.toFile());
}
}

/**
* Checks that a directory is non-null, extent, writable and a directory
* otherwise a runtime exception is thrown.
Expand Down Expand Up @@ -867,7 +879,8 @@ public static OutputStream openFileForMd5CalculatingWriting(final File file) {
}

public static OutputStream openFileForMd5CalculatingWriting(final Path file) {
return new Md5CalculatingOutputStream(IOUtil.openFileForWriting(file), file.resolve(".md5"));
final Path digestFile = file.resolveSibling(file.getFileName() + FileExtensions.MD5);
return new Md5CalculatingOutputStream(IOUtil.openFileForWriting(file), digestFile);
}

/**
Expand Down
Loading

0 comments on commit 6d1a796

Please sign in to comment.