diff --git a/.gitignore b/.gitignore index 1e659c0..2b989c0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,16 @@ .idea/workspace.xml *.a -test/Testing -CMakeFiles *.cmake *.cbp -Makefile +*.fastq +*.fq CMakeCache.txt +CMakeFiles +Makefile Testing -cmake-build-debug/ +release +debug +cmake-build-* src/fastqindex test/testapp - +test/Testing diff --git a/src/ActualRunner.cpp b/src/ActualRunner.cpp index 9774f83..6a47b88 100644 --- a/src/ActualRunner.cpp +++ b/src/ActualRunner.cpp @@ -17,25 +17,16 @@ using experimental::filesystem::path; ActualRunner::ActualRunner(const path &fastqfile, const path &indexfile) { this->fastqFile = make_shared(fastqfile); this->indexFile = indexfile; - debug(string( - "Created runner with fastq file: \"" + fastqfile.string() + "\" and index \"" + indexfile.string() + "\"")); } ActualRunner::ActualRunner(istream *fastqStream, const path &indexfile) { this->fastqFile = make_shared(fastqStream); this->indexFile = indexfile; - debug(string("Created runner with input stream and index \"" + indexfile.string() + "\"")); } ActualRunner::ActualRunner(const shared_ptr &fastqfile, const path &indexfile) { this->fastqFile = fastqfile; this->indexFile = indexfile; - if (fastqfile->isStreamSource()) - debug(string("Created runner with input stream and index \"" + indexfile.string() + "\"")); - else - debug(string("Created runner with fastq file: \"" + - dynamic_pointer_cast(fastqfile)->getPath().string() + - "\" and index \"" + indexfile.string() + "\"")); } bool ActualRunner::checkPremises() { diff --git a/src/Extractor.cpp b/src/Extractor.cpp index a7da385..3c34e0a 100644 --- a/src/Extractor.cpp +++ b/src/Extractor.cpp @@ -36,7 +36,8 @@ Extractor::Extractor( this->indexReader = make_shared(indexfile); this->resultFile = resultfile; this->forceOverwrite = forceOverwrite; - this->extractionMulitplier = extractionMulitplier == 0 ? 4 : extractionMulitplier; + this->extractionMultiplier = extractionMulitplier == 0 ? 4 : extractionMulitplier; + this->roundtripBuffer = new string[extractionMulitplier]; if (resultFile.empty() || resultFile.generic_u8string() == "-") { this->forceOverwrite = false; @@ -48,6 +49,8 @@ Extractor::Extractor( } } +Extractor::~Extractor() { delete[] roundtripBuffer; } + bool Extractor::checkPremises() { if (lineCount == 0) { addErrorMessage("Can't extract a line count of 0 lines. 
The value needs to be a positive number."); @@ -98,12 +101,6 @@ bool Extractor::extract() { out = &outfilestream; } - if (startingLine >= extractionMulitplier) { - startingLine -= extractionMulitplier; - skipLines = extractionMulitplier; - lineCount += extractionMulitplier; - } - timerStart(); shared_ptr previousEntry = indexReader->readIndexEntry(); @@ -117,7 +114,6 @@ bool Extractor::extract() { break; } startingIndexLine = entry; -// previousEntry = entry; } timerRestart("Index entry search"); @@ -185,7 +181,7 @@ bool Extractor::extract() { if (!decompressNextChunkOfData(checkForStreamEnd, Z_NO_FLUSH)) break; - if (!processDecompressedData(out, startingIndexLine)) + if (!processDecompressedChunkOfData(out, currentDecompressedBlock.str(), startingIndexLine)) continue; // Tell the extractor, that the inner loop was called at least once, so we don't remove the first line of @@ -242,67 +238,72 @@ bool Extractor::checkAndPrepareForNextConcatenatedPart(bool finalAbort) { return true; } -bool Extractor::processDecompressedData(ostream *out, const shared_ptr &startingIndexLine) { - vector splitLines; - string str = currentDecompressedBlock.str(); - splitLines = splitStr(str); +bool +Extractor::processDecompressedChunkOfData(ostream *out, string str, const shared_ptr &startingIndexLine) { + if (extractedLines >= lineCount) + return false; + vector splitLines = splitStr(str); totalSplitCount += splitLines.size(); + // In the case, that we invoke this method the first time, the index entry + bool removeIncompleteFirstLine = firstPass && startingIndexLine->offsetOfFirstValidLine > 0; + if (removeIncompleteFirstLine) splitLines.erase(splitLines.begin()); + firstPass = false; + + // Strip away incomplete last line, store this line for the next block. string curIncompleteLastLine; - // Strip away incomplete line, store this line for the next block. - string lastSplitLine; - if (!splitLines.empty()) - lastSplitLine = splitLines[splitLines.size() - 1]; + char lastChar{0}; - if (!lastSplitLine.empty()) - lastChar = lastSplitLine.c_str()[lastSplitLine.size() - 1]; + if (!str.empty()) + lastChar = str[str.size() - 1]; if (lastChar != '\n') { - curIncompleteLastLine = lastSplitLine; - if (!splitLines.empty()) + if (!splitLines.empty()) { + curIncompleteLastLine = splitLines[splitLines.size() - 1]; splitLines.pop_back(); + } totalSplitCount--; } + // Even if the split string fails, there could still be a newline in the string. Add this and continue. if (splitLines.empty()) { - incompleteLastLine = incompleteLastLine + curIncompleteLastLine; + incompleteLastLine += curIncompleteLastLine; return false; } - // Two options. Extraction began earlier OR we are processing another chunk of data. - u_int64_t iStart = 0; - if (extractedLines == 0) { - if (startingIndexLine->offsetOfFirstValidLine > 0 && firstPass) { - if (!splitLines.empty()) splitLines.erase(splitLines.begin()); - } - iStart = skip; - } else { - if (!incompleteLastLine.empty() && extractedLines < lineCount) { - storeOrOutputLine(out, &skipLines, incompleteLastLine + splitLines[0]); + bool result = true; + // Basically two cases, first case, we have enough data here and can output something, or we skip the whole chunk + if (skip >= splitLines.size()) { // Ignore + result = false; + } else { // Output + u_int64_t iStart = skip; + if (iStart == 0) { // We start right away, also use incompleteLastLine. 
+            storeOrOutputLine(out, incompleteLastLine + splitLines[0]);
             extractedLines++;
             iStart = 1;
         }
-    }
-    // iStart is 0 or 1
-    for (int i = iStart; i < splitLines.size() && extractedLines < lineCount; ++i) {
-        storeOrOutputLine(out, &skipLines, splitLines[i]);
-        extractedLines++;
+        // iStart is 0 or 1
+        for (int i = iStart; i < splitLines.size() && extractedLines < lineCount; ++i) {
+            storeOrOutputLine(out, splitLines[i]);
+            extractedLines++;
+        }
     }
 
     incompleteLastLine = curIncompleteLastLine;
     if (skip > 0) skip -= min(splitLines.size(), skip);
-    return true;
+    return result;
 }
 
-void Extractor::storeOrOutputLine(ostream *outStream, uint64_t *skipLines, string line) {
-    if (*skipLines > 0) {
-        (*skipLines)--;
-        debug(string("skip line: ") + line);
-    } else {
-        if (enableDebugging)
-            storedLines.emplace_back(line);
-        else
-            (*outStream) << line << "\n";
-    }
+void Extractor::storeOrOutputLine(ostream *outStream, string line) {
+    roundtripBufferPosition = extractedLines % extractionMultiplier;
+    roundtripBuffer[roundtripBufferPosition] = line;
+//    if (roundtripBufferPosition != extractionMultiplier - 1) return;
+
+//    for (int i = 0; i < extractionMultiplier; i++) {
+    if (enableDebugging)
+        storedLines.emplace_back(line);
+    else
+        (*outStream) << line << "\n";
+//    }
 }
 
 void Extractor::storeLinesOfCurrentBlockForDebugMode() {
diff --git a/src/Extractor.h b/src/Extractor.h
index 46ad3fc..92df715 100644
--- a/src/Extractor.h
+++ b/src/Extractor.h
@@ -31,7 +31,7 @@ class Extractor : public ZLibBasedFASTQProcessorBaseClass {
 
     u_int64_t lineCount;
 
-    uint extractionMulitplier;
+    uint extractionMultiplier;
 
     path resultFile;
 
@@ -55,10 +55,9 @@ class Extractor : public ZLibBasedFASTQProcessorBaseClass {
     // Keep track of all split lines. Merely for debugging
     u_int64_t totalSplitCount = 0;
 
-    /**
-     * Lines to skip, when the extraction starts. This is for a bugfix...
-     */
-    uint64_t skipLines = 0;
+    string *roundtripBuffer = nullptr;
+
+    uint roundtripBufferPosition = 0;
 
 public:
 
@@ -75,13 +74,31 @@ class Extractor : public ZLibBasedFASTQProcessorBaseClass {
               bool forceOverwrite, u_int64_t startingLine, u_int64_t lineCount, uint extractionMulitplier,
               bool enableDebugging);
 
-    virtual ~Extractor() = default;
+    virtual ~Extractor();
 
     /**
     * Will call tryOpenAndReadHeader on the internal indexReader.
    */
     bool checkPremises();
 
+    /**
+     * For debugging and testing, will be overridden by extract(). Sets the initial number of lines which will be
+     * omitted.
+     * TODO Move to test-aware subclass.
+     */
+    void setSkip(uint skip) {
+        this->skip = skip;
+    }
+
+    /**
+     * For debugging and testing, will be overridden by extract(). Sets the initial value of the
+     * firstPass flag.
+     * TODO Move to test-aware subclass.
+     */
+    void setFirstPass(bool firstPass) {
+        this->firstPass = firstPass;
+    }
+
     /**
      * For now directly to cout?
     * @param start
@@ -89,11 +106,21 @@ class Extractor : public ZLibBasedFASTQProcessorBaseClass {
      */
     bool extract();
 
-    bool processDecompressedData(ostream *out, const shared_ptr &startingIndexLine);
+    /**
+     * Process a decompressed chunk (NOT a complete decompressed block!) of data.
+     *
+     * Will reset the firstPass variable if called for the first time.
+     *
+     * @param out The stream to put the data to
+     * @param str The string of decompressed chunk data
+     * @param startingIndexLine The index entry which was used to step into the gzip file.
+     * @return true if something was written out, false otherwise.
+ */ + bool processDecompressedChunkOfData(ostream *out, string str, const shared_ptr &startingIndexLine); bool checkAndPrepareForNextConcatenatedPart(bool finalAbort); - void storeOrOutputLine(ostream *outStream, uint64_t *skipLines, string line); + void storeOrOutputLine(ostream *outStream, string line); void storeLinesOfCurrentBlockForDebugMode(); diff --git a/src/IndexWriter.cpp b/src/IndexWriter.cpp index c7ca099..3f2490f 100644 --- a/src/IndexWriter.cpp +++ b/src/IndexWriter.cpp @@ -64,7 +64,6 @@ bool IndexWriter::writeIndexHeader(const shared_ptr &header) { } bool IndexWriter::writeIndexEntry(const shared_ptr &entry) { - debug("Write index entry to index file."); if (!this->writerIsOpen) { // Throw assertion errors? Would actually be better right? addErrorMessage("Could not write index entry to index file, writer is not open."); diff --git a/src/Indexer.cpp b/src/Indexer.cpp index c97fae3..e1b1eb9 100644 --- a/src/Indexer.cpp +++ b/src/Indexer.cpp @@ -4,8 +4,9 @@ * Distributed under the MIT License (license terms are at https://github.com/dkfz-odcf/FastqIndEx/blob/master/LICENSE.txt). */ -#include "Indexer.h" #include "ActualRunner.h" +#include "Indexer.h" +#include "IndexStatsRunner.h" #include #include #include @@ -97,7 +98,7 @@ bool Indexer::createIndex() { // If not already set, recalculate the interval for index entries. if (blockInterval == -1) { if (sizeOfFastq == -1) { - blockInterval = 512; // Like for 32GB files. + blockInterval = 2048; // Like for files > 128GB. } else { blockInterval = Indexer::calculateIndexBlockInterval(sizeOfFastq); } @@ -124,6 +125,11 @@ bool Indexer::createIndex() { finishedSuccessful = false; return false; } + + if (writeOutOfPartialDecompressedBlocks) { + partialBlockinfoStream.open(storageForPartialDecompressedBlocks); + } + while (keepProcessing) { do { @@ -173,6 +179,11 @@ bool Indexer::createIndex() { } } + if (writeOutOfPartialDecompressedBlocks) { + partialBlockinfoStream.flush(); + partialBlockinfoStream.close(); + } + fastqfile->close(); inflateEnd(&zStream); @@ -264,48 +275,73 @@ void Indexer::finalizeProcessingForCurrentBlock(stringstream ¤tDecompresse // String representation of currentDecompressedBlock. Might or might not start with a fresh line, we need to // figure this out. 
- std::vector lines; + u_int32_t numberOfLinesInBlock{0}; string currentBlockString = currentDecompressedBlock.str(); + std::vector lines = splitStr(currentBlockString); + bool currentBlockEndedWithNewLine{false}; + bool blockIsEmpty = currentBlockString.empty(); + + if (writeOutOfDecompressedBlocksAndStatistics) { + path outfile = + storageForDecompressedBlocks.u8string() + string("/") + "decompressedblock_" + to_string(blockID) + + ".txt"; + ofstream blockStream; + blockStream.open(outfile); + blockStream << currentBlockString; + blockStream.flush(); + blockStream.close(); + } - lines = splitStr(currentBlockString); - u_int32_t sizeOfCurrentBlock = currentBlockString.size(); - u_int32_t numberOfLinesInBlock = lines.size(); + shared_ptr entry = createIndexEntryFromBlockData( + currentBlockString, + lines, + blockOffset, + lastBlockEndedWithNewline, + ¤tBlockEndedWithNewLine, + &numberOfLinesInBlock + ); + + storeDictionaryForEntry(strm, entry); + + bool written = writeIndexEntryIfPossible(entry, lines, blockIsEmpty); + + if (writeOutOfPartialDecompressedBlocks) { + + partialBlockinfoStream << "\n---- Block: " << blockID + << "\n\tlNL: " << lastBlockEndedWithNewline + << "\n\tcNL: " << currentBlockEndedWithNewLine + << "\n\t#L: " << numberOfLinesInBlock + << "\n\toff: " << entry->offsetOfFirstValidLine + << "\n\tsl: " << entry->startingLineInEntry + << "\n\tsts: " << (postponeWrite ? "Postponed" : written ? "Written" : "Skipped") + << "\n"; + if (blockIsEmpty) { + partialBlockinfoStream << "EMPTY BLOCK!\n"; + } else if (currentBlockString.size() > 20) { + partialBlockinfoStream << "STARTING 20Byte:\n'" << currentBlockString.substr(0, 20); + partialBlockinfoStream << "'\nENDING 20Byte:\n'" + << currentBlockString.substr(currentBlockString.size() - 20, 20) << "'"; + } else { + partialBlockinfoStream << "WHOLE BLOCK DATA:\n'" << currentBlockString << "'"; + } + partialBlockinfoStream.flush(); + } - // Check the current block and see, if the last character is '\n'. If so, the blockOffsetInRawFile of the first - // line in the next IndexEntry will be 0. Otherwise, we need to find the blockOffsetInRawFile. - bool currentBlockEndedWithNewLine = - sizeOfCurrentBlock == 0 ? true : currentBlockString[sizeOfCurrentBlock - 1] == '\n'; + storeLinesOfCurrentBlockForDebugMode(currentDecompressedBlock); - // Find the first newline character to get the blockOffsetInRawFile of the line inside - ushort offsetOfFirstLine{0}; - if (currentBlockString.empty()) { - } else if (!lastBlockEndedWithNewline) { - numberOfLinesInBlock--; // If the last block ended with an incomplete line (and not '\n'), reduce this. - offsetOfFirstLine = lines[0].size() + 1; - } else if (lineCountForNextIndexEntry > 0) { - lineCountForNextIndexEntry--; - } + // Keep some info for next entry. + lastBlockEndedWithNewline = currentBlockEndedWithNewLine; - // Only store every n'th block. - // Create the index entry - auto entry = make_shared( - curBits, - blockID, - offsetOfFirstLine, - blockOffset, - lineCountForNextIndexEntry); + // This will pop up a clang-tidy warning, but as Mark Adler does it, I don't want to change it. + curBits = strm->data_type & 7; - // Compared to the original zran example, which uses two memcpy operations to retrieve the dictionary, we - // use zlibs inflateGetDictionaryMethod. This looks more clean and works, whereas I could not get the - // original memcpy operations to work. -// Bytef dictTest[WINDOW_SIZE]; // Build in later, around 60% decrease in size. 
-// u_int64_t compressedBytes = WINDOW_SIZE; -// compress2(dictTest, &compressedBytes, dictionaryForNextBlock, WINDOW_SIZE, 9); + clearCurrentCompressedBlock(); +} - u_int32_t copiedBytes = 0; - memcpy(entry->dictionary, dictionaryForNextBlock, WINDOW_SIZE); - inflateGetDictionary(strm, dictionaryForNextBlock, &copiedBytes); +bool Indexer::writeIndexEntryIfPossible(shared_ptr &entry, + const vector &lines, + bool blockIsEmpty) { // Only write back every nth entry. As we need the large window / dictionary of 32kb, we'll need to save // some space here. Think of large NovaSeq FASTQ files with ~200GB! We can't store approximately 3.6m index @@ -321,6 +357,7 @@ void Indexer::finalizeProcessingForCurrentBlock(stringstream ¤tDecompresse // blocks. This results in ca. 20Bytes per block. E.g. storing every 16th block with ~ 32kByte results in a // hopelessly huge index file. + auto blockOffset = entry->blockOffsetInRawFile; u_int64_t offsetOfLastEntry = 0; bool failsafeDistanceIsReached = true; if (!disableFailsafeDistance) { @@ -329,25 +366,85 @@ void Indexer::finalizeProcessingForCurrentBlock(stringstream ¤tDecompresse offsetOfLastEntry = lastStoredEntry->blockOffsetInRawFile; failsafeDistanceIsReached = (blockOffset - offsetOfLastEntry) > (failsafeDistance); } - if (blockID == 0 || (blockID % blockInterval == 0 && failsafeDistanceIsReached)) { + + bool shouldWrite = postponeWrite || blockID == 0 || (blockID % blockInterval == 0 && failsafeDistanceIsReached); + if(blockIsEmpty && shouldWrite) { + postponeWrite = true; + shouldWrite = false; + } + + if (shouldWrite) { + postponeWrite = false; if (!forbidWriteFQI) indexWriter->writeIndexEntry(entry); lastStoredEntry = entry; if (enableDebugging) { storedEntries.emplace_back(entry); +// if (lines.size() >= 2) { +// cerr << "Storing entry with last lines:\n\t" << +// lines[0] << "\n\t" << lines[1] << "\n\t"; +// } } } + return shouldWrite; +} - storeLinesOfCurrentBlockForDebugMode(currentDecompressedBlock); +void Indexer::storeDictionaryForEntry(z_stream *strm, shared_ptr entry) { + // Compared to the original zran example, which uses two memcpy operations to retrieve the dictionary, we + // use zlibs inflateGetDictionaryMethod. This looks more clean and works, whereas I could not get the + // original memcpy operations to work. + // Bytef dictTest[WINDOW_SIZE]; // Build in later, around 60% decrease in size. + // u_int64_t compressedBytes = WINDOW_SIZE; + // compress2(dictTest, &compressedBytes, dictionaryForNextBlock, WINDOW_SIZE, 9); - // Keep some info for next entry. - lastBlockEndedWithNewline = currentBlockEndedWithNewLine; - lineCountForNextIndexEntry += numberOfLinesInBlock; + u_int32_t copiedBytes = 0; + memcpy(entry->dictionary, dictionaryForNextBlock, WINDOW_SIZE); + inflateGetDictionary(strm, dictionaryForNextBlock, &copiedBytes); +} - // This will pop up a clang-tidy warning, but as Mark Adler does it, I don't want to change it. - curBits = strm->data_type & 7; +shared_ptr Indexer::createIndexEntryFromBlockData(const string ¤tBlockString, + const vector &lines, + u_int64_t &blockOffsetInRawFile, + bool lastBlockEndedWithNewline, + bool *currentBlockEndedWithNewLine, + u_int32_t *numberOfLinesInBlock) { + u_int32_t sizeOfCurrentBlock = currentBlockString.size(); + *numberOfLinesInBlock = lines.size(); - clearCurrentCompressedBlock(); + // Check the current block and see, if the last character is '\n'. If so, the blockOffsetInRawFile of the first + // line in the next IndexEntry will be 0. 
Otherwise, we need to find the blockOffsetInRawFile.
+    *currentBlockEndedWithNewLine =
+            sizeOfCurrentBlock == 0 ? lastBlockEndedWithNewline : currentBlockString[sizeOfCurrentBlock - 1] == '\n';
+    bool hasAnyLineBreaks = false;
+    auto currentBlockCString = currentBlockString.c_str();
+    for (int i = 0; i < currentBlockString.size(); i++) {
+        if (currentBlockCString[i] == '\n') {
+            hasAnyLineBreaks = true;
+            break;
+        }
+    }
+
+    // Find the first newline character to get the blockOffsetInRawFile of the line inside
+    ushort offsetOfFirstLine{0};
+    if (currentBlockString.empty()) {
+    } else if (!lastBlockEndedWithNewline) {
+        (*numberOfLinesInBlock)--; // If the last block ended with an incomplete line (and not '\n'), reduce this.
+        if (hasAnyLineBreaks) // See case 3.1 in testlayout: a block with a \n at any position
+            offsetOfFirstLine = lines[0].size() + 1;
+    }
+    //!*currentBlockEndedWithNewLine &&
+    // Only store every n'th block.
+    // Create the index entry
+    auto entry = make_shared(
+            curBits,
+            blockID,
+            offsetOfFirstLine,
+            blockOffsetInRawFile,
+            lineCountForNextIndexEntry);
+
+    lineCountForNextIndexEntry += *numberOfLinesInBlock;
+
+    return entry;
 }
 
 void Indexer::storeLinesOfCurrentBlockForDebugMode(std::stringstream &currentDecompressedBlock) {
@@ -376,4 +473,4 @@ vector Indexer::getErrorMessages() {
         return mergeToNewVector(l, r);
     }
     return ErrorAccumulator::getErrorMessages();
-}
+}
\ No newline at end of file
diff --git a/src/Indexer.h b/src/Indexer.h
index 4c2c0d6..3b40df3 100644
--- a/src/Indexer.h
+++ b/src/Indexer.h
@@ -58,6 +58,20 @@ class Indexer : public ZLibBasedFASTQProcessorBaseClass {
      */
     vector> storedEntries;
 
+    /**
+     * For debug and test purposes, used when decompressed blocks are processed. Will store them
+     * to files at the given storage location.
+     */
+    bool writeOutOfDecompressedBlocksAndStatistics{false};
+
+    path storageForDecompressedBlocks;
+
+    bool writeOutOfPartialDecompressedBlocks{false};
+
+    path storageForPartialDecompressedBlocks;
+
+    ofstream partialBlockinfoStream;
+
     /**
      * Current bits for the next index entry.
      */
@@ -75,6 +89,12 @@ class Indexer : public ZLibBasedFASTQProcessorBaseClass {
      */
     bool lastBlockEndedWithNewline = true;
 
+    /**
+     * Empty blocks occur a lot and will cause trouble if they get written to an index file. To overcome this, we skip
+     * empty blocks until the next valid block.
+ */ + bool postponeWrite = false; + u_int64_t lineCountForNextIndexEntry{0}; u_int64_t numberOfConcatenatedFiles{1}; @@ -155,6 +175,27 @@ class Indexer : public ZLibBasedFASTQProcessorBaseClass { const uint64_t getNumberOfConcatenatedFiles() { return numberOfConcatenatedFiles; } + shared_ptr createIndexEntryFromBlockData(const string ¤tBlockString, + const vector &lines, + u_int64_t &blockOffsetInRawFile, + bool lastBlockEndedWithNewline, + bool *currentBlockEndedWithNewLine, + u_int32_t *numberOfLinesInBlock); + + void storeDictionaryForEntry(z_stream *strm, shared_ptr entry); + + bool writeIndexEntryIfPossible(shared_ptr &entry, const vector &lines, bool blockIsEmpty); + + void enableWriteOutOfDecompressedBlocksAndStatistics(const path &location) { + this->writeOutOfDecompressedBlocksAndStatistics = true; + this->storageForDecompressedBlocks = location; + } + + + void enableWriteOutOfPartialDecompressedBlocks(const path &location) { + this->writeOutOfPartialDecompressedBlocks = true; + this->storageForPartialDecompressedBlocks = location.u8string() + string("/blockinfo.txt"); + } }; diff --git a/src/IndexerRunner.h b/src/IndexerRunner.h index ab8af09..3242278 100644 --- a/src/IndexerRunner.h +++ b/src/IndexerRunner.h @@ -65,6 +65,23 @@ class IndexerRunner : public ActualRunner { unsigned char run() override; vector getErrorMessages() override; + + // Facade methods +// void enableDebugging() { +// this->indexer->enableDebugging(); +// } + +// void enableForbidWriteFQI() { +// +// } + + void enableWriteOutOfDecompressedBlocksAndStatistics(const path &location) { + this->indexer->enableWriteOutOfDecompressedBlocksAndStatistics(location); + } + + void enableWriteOutOfPartialDecompressedBlocks(const path &location) { + this->indexer->enableWriteOutOfPartialDecompressedBlocks(location); + } }; diff --git a/src/PathInputSource.cpp b/src/PathInputSource.cpp index 2859a1a..944e36f 100644 --- a/src/PathInputSource.cpp +++ b/src/PathInputSource.cpp @@ -16,14 +16,12 @@ PathInputSource::~PathInputSource() { } bool PathInputSource::open() { - debug("Opening path input source from file " + this->source.string()); if (!fPointer.is_open()) fPointer.open(source); return fPointer.is_open(); } bool PathInputSource::close() { - debug("Closing path in put source."); if (fPointer.is_open()) fPointer.close(); return true; diff --git a/src/Starter.cpp b/src/Starter.cpp index 38e8016..984ffeb 100644 --- a/src/Starter.cpp +++ b/src/Starter.cpp @@ -109,7 +109,8 @@ IndexerRunner *Starter::assembleCmdLineParserForIndexAndParseOpts(int argc, cons ValueArg verbosity( "v", "verbosity", - "Sets the verbosity of the application in the range of 0 (default, less) to 3 (debug, max). Invalid values will be ignored and the default of 0 will apply. -D automatically sets the level to 3.", + string("Sets the verbosity of the application in the range of 0 (default, less) to 3 (debug, max). ")+ + "Invalid values will be ignored and the default of 0 will apply. -D automatically sets the level to 3.", false, 0, "int", cmdLineParser); @@ -135,10 +136,23 @@ IndexerRunner *Starter::assembleCmdLineParserForIndexAndParseOpts(int argc, cons SwitchArg disableFailsafeDistanceSwitch( "S", "disablefailsafedistance", - string("Disables the minimum offset-byte-distance checks for entries in the index file. The failsafe distance is calculated with (block distance) * (16kByte)"), + string("Disables the minimum offset-byte-distance checks for entries in the index file. 
") + + "The failsafe distance is calculated with (block distance) * (16kByte)", cmdLineParser, false); + ValueArg storeForDecompressedBlocksArg( + "L", "storagefordecompressedblocks", + string("Tell the Indexer to store decompressed blocks to the set location.") + + " This function should be used with care and is meant for debugging. The target folder must exist.", + false, "", "string", cmdLineParser); + + ValueArg storeForPartialDecompressedBlocksArg( + "l", "storageforpartialdecompressedblocks", + string("Tell the Indexer to store partial information for decompressed blocks to the set location. ") + + "This function should be used with care and is meant for debugging. The target folder must exist.", + false, "", "string", cmdLineParser); + vector allowedMode{"index"}; ValuesConstraint allowedModesConstraint(allowedMode); UnlabeledValueArg mode("mode", "mode is index", true, "", &allowedModesConstraint); @@ -173,9 +187,18 @@ IndexerRunner *Starter::assembleCmdLineParserForIndexAndParseOpts(int argc, cons if (dbg) ErrorAccumulator::setVerbosity(3); - return new IndexerRunner(fastq, index, bi, dbg, fo, - forbidIndexWriteoutSwitch.getValue(), - disableFailsafeDistanceSwitch.getValue()); + auto runner = new IndexerRunner(fastq, index, bi, dbg, fo, + forbidIndexWriteoutSwitch.getValue(), + disableFailsafeDistanceSwitch.getValue()); + + if (storeForDecompressedBlocksArg.isSet()) + runner->enableWriteOutOfDecompressedBlocksAndStatistics(storeForDecompressedBlocksArg.getValue()); + if (storeForPartialDecompressedBlocksArg.isSet()) { + cerr << "Setting location for partial decompressed block data file to: '" + << storeForPartialDecompressedBlocksArg.getValue() << "'\n"; + runner->enableWriteOutOfPartialDecompressedBlocks(storeForPartialDecompressedBlocksArg.getValue()); + } + return runner; } ExtractorRunner *Starter::assembleCmdLineParserForExtractAndParseOpts(int argc, const char **argv) { @@ -195,7 +218,7 @@ ExtractorRunner *Starter::assembleCmdLineParserForExtractAndParseOpts(int argc, ValueArg indexFile( "i", "indexfile", string("The index file which shall be used for extraction. If the value is not set, .fqi will be") + - "to the name of the FASTQ file.", + "appended to the name of the FASTQ file.", false, "", "string", cmdLineParser); @@ -212,7 +235,7 @@ ExtractorRunner *Starter::assembleCmdLineParserForExtractAndParseOpts(int argc, ValueArg extractionmultiplier( "e", "extractionmultiplier", - string("Defines a multiplier by which the startingline parameter will be mulitplied. For FASTQ files ") + + string("Defines a multiplier by which the startingline parameter will be multiplied. For FASTQ files ") + "this is 4 (record size), but you could use 1 for e.g. regular text files.", false, 4, "int", cmdLineParser); @@ -240,7 +263,8 @@ ExtractorRunner *Starter::assembleCmdLineParserForExtractAndParseOpts(int argc, ValueArg verbosity( "v", "verbosity", - "Sets the verbosity of the application in the range of 0 (default, less) to 3 (debug, max). Invalid values will be ignored and the default of 0 will apply. -D automatically sets the level to 3.", + string("Sets the verbosity of the application in the range of 0 (default, less) to 3 (debug, max). ") + + "Invalid values will be ignored and the default of 0 will apply. 
-D automatically sets the level to 3.", false, 0, "int", cmdLineParser); @@ -264,13 +288,18 @@ ExtractorRunner *Starter::assembleCmdLineParserForExtractAndParseOpts(int argc, if (indexFile.getValue().empty()) _indexFile = path(fastqFile.getValue() + ".fqi"); + int _extractionMultiplier = extractionmultiplier.getValue(); + if (_extractionMultiplier <= 0) + _extractionMultiplier = 0; + return new ExtractorRunner( make_shared(argumentToPath(fastqFile)), _indexFile, argumentToPath(outFile), forceOverwrite.getValue(), - startingread.getValue() * extractionmultiplier.getValue(), - numberofreads.getValue() * extractionmultiplier.getValue(), 0, + startingread.getValue() * _extractionMultiplier, + numberofreads.getValue() * _extractionMultiplier, + _extractionMultiplier, debugSwitch.getValue() ); } diff --git a/src/StreamInputSource.cpp b/src/StreamInputSource.cpp index e4f61b3..ba08a08 100644 --- a/src/StreamInputSource.cpp +++ b/src/StreamInputSource.cpp @@ -12,12 +12,10 @@ const u_int64_t StreamInputSource::getTotalReadBytes() { } bool StreamInputSource::open() { - debug("Opening stream input source"); return true; } bool StreamInputSource::close() { - debug("Closing stream input source"); return true; } diff --git a/src/ZLibBasedFASTQProcessorBaseClass.cpp b/src/ZLibBasedFASTQProcessorBaseClass.cpp index 87f0be8..cdd8309 100644 --- a/src/ZLibBasedFASTQProcessorBaseClass.cpp +++ b/src/ZLibBasedFASTQProcessorBaseClass.cpp @@ -114,6 +114,7 @@ bool ZLibBasedFASTQProcessorBaseClass::decompressNextChunkOfData(bool checkForSt zlibResult = Z_DATA_ERROR; } if (zlibResult == Z_MEM_ERROR || zlibResult == Z_DATA_ERROR) { + cerr << "Zlib data or memory error occurred: " << zlibResult << "\n"; errorWasRaised = true; return false; } diff --git a/src/ZLibBasedFASTQProcessorBaseClass.h b/src/ZLibBasedFASTQProcessorBaseClass.h index 2c402da..68acc91 100644 --- a/src/ZLibBasedFASTQProcessorBaseClass.h +++ b/src/ZLibBasedFASTQProcessorBaseClass.h @@ -109,8 +109,8 @@ class ZLibBasedFASTQProcessorBaseClass : public ErrorAccumulator { while (std::getline(ss, item, delimiter)) { splittedStrings.push_back(item); } - if (str.c_str()[str.size() - 1] == delimiter) - splittedStrings.emplace_back(""); +// if (str.c_str()[str.size() - 1] == delimiter) +// splittedStrings.emplace_back(""); return splittedStrings; } diff --git a/test/ExtractorTest.cpp b/test/ExtractorTest.cpp index cd0db92..54a44dc 100644 --- a/test/ExtractorTest.cpp +++ b/test/ExtractorTest.cpp @@ -20,6 +20,7 @@ const char *const TEST_EXTRACTOR_CREATION = "Extractor creation"; const char *const TEST_CREATE_EXTRACTOR_AND_EXTRACT_SMALL_TO_COUT = "Combined test for index creation and extraction with the small dataset, extracts to cout."; const char *const TEST_CREATE_EXTRACTOR_AND_EXTRACT_LARGE_TO_COUT = "Combined test for index creation and extraction with the larger dataset, extracts to cout."; const char *const TEST_CREATE_EXTRACTOR_AND_EXTRACT_CONCAT_TO_COUT = "Combined test for index creation and extraction with the concatenated dataset, extracts to cout."; +const char *const TEST_PROCESS_DECOMPRESSED_DATA = "Test processDecompressedChunkOfData() with some test data files (analogous to IndexerTest::TEST_CORRECT_BLOCK_LINE_COUNTING.)"; const char *const TEST_EXTRACTOR_CHECKPREM_OVERWRITE_EXISTING = "Test fail on exsiting file with disabled overwrite."; const char *const TEST_EXTRACTOR_CHECKPREM_MISSING_NOTWRITABLE = "Test fail on non-writable result file with overwrite enabled."; const char *const TEST_EXTRACTOR_CHECKPREM_MISSING_PARENTNOTWRITABLE = "Test 
fail on non-writable output folder."; @@ -48,15 +49,9 @@ void runRangedExtractionTest(const path &fastq, if (!ok) return; - CHECK_EQUAL(expectedLineCount, lines.size()); + CHECK(expectedLineCount == lines.size()); - u_int64_t differences = 0; - for (int i = 0; i < min(decompressedSourceContent.size() - firstLine, lines.size()); i++) { - if (decompressedSourceContent[i + firstLine] != lines[i]) { - differences++; - } - } - CHECK_EQUAL(0, differences); + CHECK(TestResourcesAndFunctions::compareVectorContent(decompressedSourceContent, lines, firstLine)); } bool initializeComplexTest(const path &fastq, @@ -71,6 +66,7 @@ bool initializeComplexTest(const path &fastq, */ auto *indexer = new Indexer(make_shared(fastq), index, blockInterval, true, false, false, true); CHECK(indexer->checkPremises()); + indexer->enableWriteOutOfDecompressedBlocksAndStatistics(index.parent_path()); indexer->createIndex(); bool success = indexer->wasSuccessful(); CHECK(success); @@ -119,31 +115,97 @@ SUITE (INDEXER_SUITE_TESTS) { CHECK(extractor->checkPremises()); delete extractor; } +// +// TEST (TEST_EXTRACTOR_CHECKPREM_OVERWRITE_EXISTING) { +// CHECK(false); +// } +// +// TEST (TEST_EXTRACTOR_CHECKPREM_MISSING_NOTWRITABLE) { +// CHECK(false); +// } +// +// TEST (TEST_EXTRACTOR_CHECKPREM_MISSING_PARENTNOTWRITABLE) { +// CHECK(false); +// } +// +// TEST (TEST_EXTRACTOR_EXTRACT_WITH_OUTFILE) { +// TestResourcesAndFunctions res(INDEXER_SUITE_TESTS, TEST_EXTRACTOR_EXTRACT_WITH_OUTFILE); +// +// res.getResource(TEST_FASTQ_SMALL); +// +// CHECK(false); +// } +// +// TEST (TEST_EXTRACTOR_EXTRACT_WITH_EXISTINGOUTFILE) { +// // Check, that the output file size matches! +// CHECK(false); +// } - TEST (TEST_EXTRACTOR_CHECKPREM_OVERWRITE_EXISTING) { - CHECK(false); - } - - TEST (TEST_EXTRACTOR_CHECKPREM_MISSING_NOTWRITABLE) { - CHECK(false); - } - - TEST (TEST_EXTRACTOR_CHECKPREM_MISSING_PARENTNOTWRITABLE) { - CHECK(false); - } - - TEST (TEST_EXTRACTOR_EXTRACT_WITH_OUTFILE) { - TestResourcesAndFunctions res(INDEXER_SUITE_TESTS, TEST_EXTRACTOR_EXTRACT_WITH_OUTFILE); + TEST (TEST_PROCESS_DECOMPRESSED_DATA) { + TestResourcesAndFunctions res(INDEXER_SUITE_TESTS, TEST_PROCESS_DECOMPRESSED_DATA); + vector _blockData = TestResourcesAndFunctions::getTestVectorWithSimulatedBlockData(); - res.getResource(TEST_FASTQ_SMALL); + path fastq = res.getResource(TEST_FASTQ_LARGE); + path index = res.filePath("test2.fastq.gz.fqi"); + + vector> indexEntries; + + bool lastBlockEndedWithNewline = true; + Indexer indexer(make_shared(fastq), index, 1, true); + for (auto bd : _blockData) { + auto split = ZLibBasedFASTQProcessorBaseClass::splitStr(bd); + bool currentBlockEndedWithNewline; + u_int32_t numberOfLinesInBlock; + u_int64_t offset = 0; + auto entry = indexer.createIndexEntryFromBlockData(bd, split, offset, lastBlockEndedWithNewline, + ¤tBlockEndedWithNewline, + &numberOfLinesInBlock); + indexEntries.emplace_back(entry); + lastBlockEndedWithNewline = currentBlockEndedWithNewline; + } - CHECK(false); + uint startingLines[]{0, 4, 8, 12, 16, 18}; + int indexEntryID[]{0, 1, 2, 7, 9, 7}; + int startingBlockIDs[]{0, 1, 2, 7, 8, 7}; + int expectedLines[]{4, 4, 4, 4, 4, 2}; + + // The Extractor works a bit differently than the Indexer. In the Indexer, we decompress whole blocks, whereas + // in the Extractor, we decompress chunk-wise (technical reasons, the flush mode Z_BLOCK does not work). One + // problem here is, that the skip can be bigger than the decompressed chunk of data and this lead to problems + // in the past. 
To test this, we start with a "lower" index entry and add a skip factor for the LAST test! + + // Please also note, that the last line of the test dataset has NO newline, which can also occur in some source + // files. This line will not be found by the tested method here but will be output with a WARNING at the end of + // the extraction process. + + for (int i = 0; i < 6; i++) { + auto startingLine = startingLines[i]; + auto lineCount = 4; + auto indexEntry = indexEntries[indexEntryID[i]]; + Extractor extractor(make_shared(fastq), index, path("-"), false, startingLine, lineCount, + 4, true); + extractor.setSkip(startingLine - indexEntry->startingLineInEntry); + ostringstream outStream; + + for (int j = startingBlockIDs[i]; j < _blockData.size(); j++) { + extractor.processDecompressedChunkOfData(&outStream, _blockData[j], indexEntry->toIndexEntry()); + } + auto split = ZLibBasedFASTQProcessorBaseClass::splitStr(outStream.str()); + CHECK(extractor.getStoredLines().size() == expectedLines[i]); + } } - TEST (TEST_EXTRACTOR_EXTRACT_WITH_EXISTINGOUTFILE) { - // Check, that the output file size matches! - CHECK(false); - } + /** + * This test will check the case where an incomplete line was printed because the last incomplete line was ignored. + * Conditions: + * - Still first pass, no lines written yet + * - Found index entry starting line offset == 0 + * - Skip is 0 after the last invocation of processDecompressedData + * - lastIncomplete line is filled + */ +// TEST ("Test special case: skip in first pass is of splitLines.size() with incomplete last line.") { +// +// } TEST (TEST_CREATE_EXTRACTOR_AND_EXTRACT_SMALL_TO_COUT) { // We won't test the indexing here as it is already tested in TEST_CREATE_INDEX_SMALL @@ -180,14 +242,14 @@ SUITE (INDEXER_SUITE_TESTS) { u_int64_t linesInFastq = 160000; vector decompressedSourceContent; - if (!initializeComplexTest(fastq, index, extractedFastq, 4, linesInFastq, &decompressedSourceContent)) + if (!initializeComplexTest(fastq, index, extractedFastq, 1, linesInFastq, &decompressedSourceContent)) return; runRangedExtractionTest(fastq, index, decompressedSourceContent, 0, 2740, 2740); runRangedExtractionTest(fastq, index, decompressedSourceContent, 0, 160000, 160000); runRangedExtractionTest(fastq, index, decompressedSourceContent, 2740, 160000, 157260); - runRangedExtractionTest(fastq, index, decompressedSourceContent, 14740, 4000, 4000); + runRangedExtractionTest(fastq, index, decompressedSourceContent, 14223, 4000, 4000); for (u_int64_t i = 0, j = 0; i < 150000; i += 17500, j++) { runRangedExtractionTest(fastq, index, decompressedSourceContent, 2740 + i, 4000 + j, 4000 + j); } diff --git a/test/IndexReaderTest.cpp b/test/IndexReaderTest.cpp index 844bcc1..02e8a75 100644 --- a/test/IndexReaderTest.cpp +++ b/test/IndexReaderTest.cpp @@ -56,7 +56,7 @@ SUITE (SUITE_INDEXREADER_TESTS) { path idx = res.getResource(TEST_INDEX_SMALL); auto ir = make_shared(idx); CHECK(ir->tryOpenAndReadHeader()); - CHECK(ir->getIndexHeader()); + CHECK(ir->getErrorMessages().empty()); CHECK_EQUAL(ir->getIndicesLeft(), 1); } @@ -102,7 +102,7 @@ SUITE (SUITE_INDEXREADER_TESTS) { u_int64_t test[62] = {0}; - CHECK(header); +// CHECK(header.get() != nullptr); CHECK_EQUAL(1, header.indexWriterVersion); CHECK_EQUAL(67305985, header.magicNumber); CHECK_ARRAY_EQUAL(test, header.reserved, 62); @@ -128,7 +128,7 @@ SUITE (SUITE_INDEXREADER_TESTS) { auto header = ir->getIndexHeader(); - CHECK(header); +// CHECK(header); Bytef emptyWindow[WINDOW_SIZE]{0}; @@ -157,7 +157,7 @@ SUITE 
(SUITE_INDEXREADER_TESTS) { auto header = ir->getIndexHeader(); - CHECK(header); +// CHECK(header); CHECK(ir->readIndexEntryV1()); CHECK(ir->readIndexEntryV1()); CHECK(ir->readIndexEntryV1()); @@ -180,7 +180,7 @@ SUITE (SUITE_INDEXREADER_TESTS) { auto header = ir->getIndexHeader(); - CHECK(header); +// CHECK(header); CHECK_EQUAL(1, ir->getIndicesLeft()); auto entry1 = ir->readIndexEntryV1(); CHECK_EQUAL(0, ir->getIndicesLeft()); diff --git a/test/IndexerTest.cpp b/test/IndexerTest.cpp index c7f457b..3b110de 100644 --- a/test/IndexerTest.cpp +++ b/test/IndexerTest.cpp @@ -18,6 +18,7 @@ const char *const INDEXER_SUITE_TESTS = "IndexerTests"; const char *const TEST_INDEXER_CREATION = "IndexerCreation"; +const char *const TEST_CORRECT_BLOCK_LINE_COUNTING = "Test the correct counting of lines in decompressed blocks."; const char *const TEST_CREATE_INDEX = "testCreateIndex"; const char *const TEST_CREATE_INDEX_SMALL = "Test create index with small fastq test data."; const char *const TEST_CREATE_INDEX_W_STREAMED_DATA = "Test create index with streamed concatenated data"; @@ -80,9 +81,54 @@ SUITE (INDEXER_SUITE_TESTS) { CHECK_EQUAL(Indexer::INDEXER_VERSION, header->indexWriterVersion); } - // TEST ("readCompressedDataFromInputSource") <-- How to write a test? Currently its covered in the larger tests. - - // TEST ("call createIndex() twice") + TEST (TEST_CORRECT_BLOCK_LINE_COUNTING) { + TestResourcesAndFunctions res(INDEXER_SUITE_TESTS, TEST_CORRECT_BLOCK_LINE_COUNTING); + vector _blockData = TestResourcesAndFunctions::getTestVectorWithSimulatedBlockData(); + + // The vector contains IndexEntries with some expected values: line offset, starting line + // This is more to keep things clear and easily readable. + vector expectedIndexEntries; + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 0, 0, 0)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 0, 0, 3)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 3, 0, 6)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 0, 0, 9)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 0, 0, 9)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 3, 0, 9)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 0, 0, 12)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 2, 0, 12)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 0, 0, 15)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 0, 0, 15)); + expectedIndexEntries.emplace_back(IndexEntry(0, 0, 0, 0, 18)); + + u_int32_t expectedNumberOfLinesInBlock[] = {3, 3, 3, 0, 0, 3, 0, 3, 0, 3, 3}; + + // Files are not actually used. 
+ path fastq = res.getResource(TEST_FASTQ_LARGE); + path index = res.filePath("test2.fastq.gz.fqi"); + Indexer indexer(make_shared(fastq), index, -1, true); + + bool lastBlockEndedWithNewline = true; + + for (int i = 0; i < _blockData.size(); i++) { + auto blockData = _blockData[i]; + auto split = ZLibBasedFASTQProcessorBaseClass::splitStr(blockData); + auto expectedNumberOfLines = expectedNumberOfLinesInBlock[i]; + auto expectedFirstLineOffset = expectedIndexEntries[i].offsetOfFirstValidLine; + auto expectedStartingLine = expectedIndexEntries[i].startingLineInEntry; + + uint64_t off = 0; + bool currentBlockEndedWithNewline; + u_int32_t numberOfLinesInBlock; + auto entry = indexer.createIndexEntryFromBlockData(blockData, split, off, lastBlockEndedWithNewline, + ¤tBlockEndedWithNewline, &numberOfLinesInBlock); + CHECK(entry->blockOffsetInRawFile == 0); + CHECK(entry->offsetOfFirstValidLine == expectedFirstLineOffset); + CHECK(entry->startingLineInEntry == expectedStartingLine); + CHECK(numberOfLinesInBlock == expectedNumberOfLines); + + lastBlockEndedWithNewline = currentBlockEndedWithNewline; + } + } TEST (TEST_CREATE_INDEX_CONCAT_SINGLEBLOCKS) { TestResourcesAndFunctions res(INDEXER_SUITE_TESTS, TEST_CREATE_INDEX_CONCAT_SINGLEBLOCKS); @@ -112,25 +158,12 @@ SUITE (INDEXER_SUITE_TESTS) { CHECK(1 == storedEntries.size()); CHECK(numberOfLinesInTestFASTQ == storedLineCount); - int firstDiff = -1; // This is more for debug purposes. - result = TestResourcesAndFunctions::extractGZFile(fastq, extractedFastq); CHECK_EQUAL(true, result); - ifstream strm(extractedFastq); - vector decompressedSourceContent; - string line; - while (std::getline(strm, line)) { - decompressedSourceContent.emplace_back(line); - } + vector decompressedSourceContent = TestResourcesAndFunctions::readLinesOfFile(extractedFastq); - for (int i = 0; i < std::min(storedLineCount, numberOfLinesInTestFASTQ); i++) { - if (storedLines[i] != decompressedSourceContent[i]) { - firstDiff = i; - break; - } - } - CHECK(firstDiff == -1); + CHECK(TestResourcesAndFunctions::compareVectorContent(storedLines, decompressedSourceContent)); delete indexer; } @@ -170,25 +203,12 @@ SUITE (INDEXER_SUITE_TESTS) { CHECK_EQUAL(1, storedEntries.size()); CHECK(numberOfLinesInTestFASTQ == storedLineCount); - int firstDiff = -1; // This is more for debug purposes. - result = TestResourcesAndFunctions::extractGZFile(concat, extractedFastq); CHECK_EQUAL(true, result); - ifstream strm(extractedFastq); - vector decompressedSourceContent; - string line; - while (std::getline(strm, line)) { - decompressedSourceContent.emplace_back(line); - } + vector decompressedSourceContent = TestResourcesAndFunctions::readLinesOfFile(extractedFastq); - for (int i = 0; i < std::min(storedLineCount, numberOfLinesInTestFASTQ); i++) { - if (storedLines[i] != decompressedSourceContent[i]) { - firstDiff = i; - break; - } - } - CHECK(firstDiff == -1); + CHECK(TestResourcesAndFunctions::compareVectorContent(storedLines, decompressedSourceContent)); // Why is this a pointer? Just to get access to the file on the command line. It is written if the // Indexer is delete OR enough data was written. 
If we do not have the pointer, the file gets written after the @@ -264,7 +284,8 @@ SUITE (INDEXER_SUITE_TESTS) { path fastq = res.getResource(string(TEST_FASTQ_LARGE)); path index = res.filePath("test2.fastq.gz.fqi"); ifstream fqStream(fastq); - IndexerRunner runner(shared_ptr(new StreamInputSource(&fqStream)), index, -1, false, false, false, true); + IndexerRunner runner(shared_ptr(new StreamInputSource(&fqStream)), index, -1, false, false, false, + true); CHECK(runner.run() == 0); CHECK(file_size(index) > 0); } @@ -377,19 +398,10 @@ SUITE (INDEXER_SUITE_TESTS) { result = TestResourcesAndFunctions::extractGZFile(fastq, extractedFastq); CHECK_EQUAL(true, result); - ifstream strm(extractedFastq); - vector decompressedSourceContent; - string line; - while (std::getline(strm, line)) { - decompressedSourceContent.emplace_back(line); - } + vector decompressedSourceContent = TestResourcesAndFunctions::readLinesOfFile(extractedFastq); CHECK_EQUAL(160000, decompressedSourceContent.size()); - for (int i = 0; i < 160000; i++) { - auto equal = decompressedSourceContent[i] == storedLines[i]; - if (!equal) - CHECK_EQUAL (true, equal); - } + CHECK(TestResourcesAndFunctions::compareVectorContent(storedLines, decompressedSourceContent)); // Now check the index file in a very simple way (extractor tests come later). We know, that there is one header // and several entries. diff --git a/test/TestResourcesAndFunctions.cpp b/test/TestResourcesAndFunctions.cpp index abddcca..12d93e8 100644 --- a/test/TestResourcesAndFunctions.cpp +++ b/test/TestResourcesAndFunctions.cpp @@ -17,6 +17,10 @@ using namespace std::experimental::filesystem; using std::experimental::filesystem::path; +mutex TestResourcesAndFunctions::staticLock; + +vector TestResourcesAndFunctions::testVectorWithSimulatedDecompressedBlockData; + TestResourcesAndFunctions::TestResourcesAndFunctions(string testSuite, string testName) { this->testSuite = move(testSuite); this->testName = move(testName); @@ -200,3 +204,60 @@ bool TestResourcesAndFunctions::createConcatenatedFile(const path &file, const p } return res; } + +vector TestResourcesAndFunctions::readLinesOfFile(const path &file) { + ifstream strm(file); + vector decompressedSourceContent; + string line; + while (std::getline(strm, line)) { + decompressedSourceContent.emplace_back(line); + } + return decompressedSourceContent; +} + +string TestResourcesAndFunctions::readFile(const path &file) { + std::ifstream t(file); + std::string str; + + t.seekg(0, std::ios::end); + str.reserve(t.tellg()); + t.seekg(0, std::ios::beg); + + str.assign((std::istreambuf_iterator(t)), std::istreambuf_iterator()); + return str; +} + +bool TestResourcesAndFunctions::compareVectorContent(const vector &reference, const vector &actual, + uint32_t referenceOffset) { + int64_t firstDiff{-1}; + for (int i = 0; i < std::min(reference.size() - referenceOffset, actual.size()); i++) { + if (reference[i + referenceOffset] != actual[i]) { + firstDiff = i; + break; + } + } + return firstDiff == -1; +} + +const vector &TestResourcesAndFunctions::getTestVectorWithSimulatedBlockData() { + lock_guard lock(TestResourcesAndFunctions::staticLock); + + if (!testVectorWithSimulatedDecompressedBlockData.empty()) + return testVectorWithSimulatedDecompressedBlockData; + + vector *v = &testVectorWithSimulatedDecompressedBlockData; + + v->emplace_back(readResourceFile("blockAndLineCalculations/block_0_complete")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_1_endopen")); + 
v->emplace_back(readResourceFile("blockAndLineCalculations/block_2_startendopen")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_5_empty")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_5_empty")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_3_startendopen")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_3.1_startendopen_nonewlines")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_4_startopen")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_5_empty")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_6_newlines")); + v->emplace_back(readResourceFile("blockAndLineCalculations/block_7_finalwonewlines")); + + return testVectorWithSimulatedDecompressedBlockData; +} diff --git a/test/TestResourcesAndFunctions.h b/test/TestResourcesAndFunctions.h index 32c0b20..00c73d7 100644 --- a/test/TestResourcesAndFunctions.h +++ b/test/TestResourcesAndFunctions.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "TestConstants.h" using namespace std; @@ -31,6 +32,10 @@ class TestResourcesAndFunctions { mutex lock; + static mutex staticLock; + + static vector testVectorWithSimulatedDecompressedBlockData; + public: TestResourcesAndFunctions(string testSuite, string testName); @@ -55,7 +60,7 @@ class TestResourcesAndFunctions { path filePath(const string &filename); - path getResource(const string &filename); + static path getResource(const string &filename); path createEmptyFile(const string &filename); @@ -86,6 +91,23 @@ class TestResourcesAndFunctions { static bool extractGZFile(const path &file, const path &extractedFile); static bool createConcatenatedFile(const path &file, const path &result, int repetitions); + + static vector readLinesOfFile(const path &file); + + static string readFile(const path &file); + + vector readLinesOfResourceFile(const string &resourceFile) { + return readLinesOfFile(getResource(resourceFile)); + } + + static string readResourceFile(const string &resourceFile) { + return readFile(getResource(resourceFile)); + } + + static bool + compareVectorContent(const vector &reference, const vector &actual, uint32_t referenceOffset = 0); + + static const vector &getTestVectorWithSimulatedBlockData(); }; diff --git a/test/ZLibBasedFASTQProcessorBaseClassTest.cpp b/test/ZLibBasedFASTQProcessorBaseClassTest.cpp index ada9f7c..01899c9 100644 --- a/test/ZLibBasedFASTQProcessorBaseClassTest.cpp +++ b/test/ZLibBasedFASTQProcessorBaseClassTest.cpp @@ -48,7 +48,7 @@ SUITE (TEST_ZLIBBASE_SUITE) { TEST (TEST_SPLIT_STR) { auto text = string("one\ntwo\nthree\nfour\n5\n6\n7\n\n"); vector expectedVector{ - "one", "two", "three", "four", "5", "6", "7", "", "" + "one", "two", "three", "four", "5", "6", "7", "" }; vector res = ZLibBasedFASTQProcessorBaseClass::splitStr(text); CHECK_EQUAL(expectedVector.size(), res.size()); diff --git a/test/resources/blockAndLineCalculations/block_0_complete b/test/resources/blockAndLineCalculations/block_0_complete new file mode 100644 index 0000000..5d99def --- /dev/null +++ b/test/resources/blockAndLineCalculations/block_0_complete @@ -0,0 +1,3 @@ +0:complete_block +1:000 +2:000 diff --git a/test/resources/blockAndLineCalculations/block_1_endopen b/test/resources/blockAndLineCalculations/block_1_endopen new file mode 100644 index 0000000..10a244e --- /dev/null +++ b/test/resources/blockAndLineCalculations/block_1_endopen @@ -0,0 +1,3 @@ +3:endopen_block +4:000 +5:0 \ No newline at end of file diff --git 
a/test/resources/blockAndLineCalculations/block_2_startendopen b/test/resources/blockAndLineCalculations/block_2_startendopen new file mode 100644 index 0000000..20b529a --- /dev/null +++ b/test/resources/blockAndLineCalculations/block_2_startendopen @@ -0,0 +1,4 @@ +00 +6:startendopen_block # With part of the previous block and without \n at the end. +7:000 +8:0 \ No newline at end of file diff --git a/test/resources/blockAndLineCalculations/block_3.1_startendopen_nonewlines b/test/resources/blockAndLineCalculations/block_3.1_startendopen_nonewlines new file mode 100644 index 0000000..c227083 --- /dev/null +++ b/test/resources/blockAndLineCalculations/block_3.1_startendopen_nonewlines @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/test/resources/blockAndLineCalculations/block_3_startendopen b/test/resources/blockAndLineCalculations/block_3_startendopen new file mode 100644 index 0000000..adad2b6 --- /dev/null +++ b/test/resources/blockAndLineCalculations/block_3_startendopen @@ -0,0 +1,4 @@ +00 +9:startendopen_block # With part of the previous block and without \n at the end. +10:000 +11:0 \ No newline at end of file diff --git a/test/resources/blockAndLineCalculations/block_4_startopen b/test/resources/blockAndLineCalculations/block_4_startopen new file mode 100644 index 0000000..991c1d5 --- /dev/null +++ b/test/resources/blockAndLineCalculations/block_4_startopen @@ -0,0 +1,4 @@ +0 +12:startopen_block # With part of the previous block and \n at the end +13:000 +14:000 diff --git a/test/resources/blockAndLineCalculations/block_5_empty b/test/resources/blockAndLineCalculations/block_5_empty new file mode 100644 index 0000000..e69de29 diff --git a/test/resources/blockAndLineCalculations/block_6_newlines b/test/resources/blockAndLineCalculations/block_6_newlines new file mode 100644 index 0000000..b28b04f --- /dev/null +++ b/test/resources/blockAndLineCalculations/block_6_newlines @@ -0,0 +1,3 @@ + + + diff --git a/test/resources/blockAndLineCalculations/block_7_finalwonewlines b/test/resources/blockAndLineCalculations/block_7_finalwonewlines new file mode 100644 index 0000000..b17df4e --- /dev/null +++ b/test/resources/blockAndLineCalculations/block_7_finalwonewlines @@ -0,0 +1,3 @@ +18:complete_block # Final block (without \n at the end!) +19:000 +20:000 \ No newline at end of file diff --git a/test/resources/blockAndLineCalculations/testlayout b/test/resources/blockAndLineCalculations/testlayout new file mode 100644 index 0000000..e1d33b1 --- /dev/null +++ b/test/resources/blockAndLineCalculations/testlayout @@ -0,0 +1,27 @@ +File layout for tests. | marks a block end/start. + +| Marks a new file + +### Data blocks +0:complete_block # 0: 3 lines, 0 offset, 0 start; +1:000 +2:000 +|3:endopen_block # 1: 3 lines, 0 offset, 3 start; Without \n at the end +4:000 +5:0|00 +6:startendopen_block # 2: 3 lines, 3 offset, 6 start; With part of the previous block and without \n at the end. +7:000 +8:0||| # 5,5: 0 lines, 0 offset, 9 start; 2 times empty files. +00 +9:startendopen_block # 3: 3 lines, 3 offset, 9 start; With part of the previous block and without \n at the end. +10:000 +11:0|0|0 # 3.1: 0 lines, 0 offset, 12 start; Start open, end open, only one line +12:startopen_block # 4: 3 lines, 2 offset, 12 start; With part of the previous block and \n at the end +13:000 +14:000 +|| # 5,6: empty block; + # 6: 3 lines, 0 offset, 15 start; File with several newlines + +|18:complete_block # 7: 3 lines, 0 offset, 18 start; Final block (without \n at the end!) 
+19:000 +20:000 \ No newline at end of file