CNDB-9104: Port over chunk cache improvements from DSE #1495

Open · wants to merge 34 commits into base: main
Changes from 33 commits

Commits (34)
3968200
Remove unused BufferHolder methods
blambov Jan 8, 2025
1ad3ac4
DB-2904 port: Use same size buffers in chunk cache
blambov Jan 8, 2025
cfa0842
Always use off-heap memory for chunk cache.
blambov Jan 8, 2025
0f84fc2
Use networking buffer pool for compressed reads.
blambov Jan 8, 2025
31a5fda
Allow buffer pool to return one-buffer multi-page chunks
blambov Jan 8, 2025
6aa0130
Port over some chunk cache tests
blambov Jan 9, 2025
1a15fa5
Set up for on-heap memory usage test
blambov Jan 10, 2025
c4f14ee
Introduce fileID and invalidate file by dropping id.
blambov Jan 9, 2025
307aa7f
Store addresses and attachments to avoid a direct buffer per entry
blambov Jan 10, 2025
be317b2
Sleep for jmap
blambov Jan 10, 2025
1313839
Remove pre-computed key hash
blambov Jan 10, 2025
36be7e5
Revert "Sleep for jmap"
blambov Jan 10, 2025
9190348
Revert "Set up for on-heap memory usage test"
blambov Jan 10, 2025
d4e230e
Review changes and license fix
blambov Jan 10, 2025
d5a22c8
Drop ChunkReader reference from Key
blambov Jan 10, 2025
fa2551f
Revert unneeded change
blambov Jan 10, 2025
89817b4
Test improvements
blambov Jan 10, 2025
c0c4716
Use page splitting for large buffers too, to avoid having to store a …
blambov Jan 10, 2025
7ab70b0
Fix test.
blambov Jan 10, 2025
2539b18
Review comments
blambov Jan 13, 2025
c01dbb7
Move code unchanged in ChunkCache.java
blambov Jan 13, 2025
5262c59
Fix test compilation
blambov Jan 13, 2025
7f0d6ba
Change sizeOfFile to accept File
blambov Jan 13, 2025
6686f6d
Fix and test chunk cache retention after early open
blambov Jan 13, 2025
462fb34
Provide precise end position for early-open sstables
blambov Jan 13, 2025
d755d32
Move cache invalidation FileHandle.Builder creation
blambov Jan 14, 2025
9fc879e
Add comment and remove unused method
blambov Jan 14, 2025
09199b3
Test fix
blambov Jan 14, 2025
3f08d7f
Test fix
blambov Jan 14, 2025
ba1232f
Invalidate cache only on request by calling file handle builder's inv…
blambov Jan 14, 2025
226bc15
Revert "Invalidate cache only on request by calling file handle build…
blambov Jan 14, 2025
743e53a
Invalidate both on making SequentialWriter and on global SSTableReade…
blambov Jan 14, 2025
1bbc990
Remove invalidation in SequentialWriter and rely on invalidation duri…
blambov Jan 14, 2025
dac75ad
Change order of cache invalidation and obsoletion
blambov Jan 15, 2025
613 changes: 417 additions & 196 deletions src/java/org/apache/cassandra/cache/ChunkCache.java

Large diffs are not rendered by default.
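Since the ChunkCache.java diff itself is not rendered here, the following is a minimal, hypothetical sketch of the "Introduce fileID and invalidate file by dropping id" idea from the commit list: each file is mapped to a numeric id, cache keys embed that id rather than the file name, and invalidating a file simply drops its id so stale entries can never be matched again and age out of the cache. All class and member names below are illustrative, not the PR's actual code.

```java
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.cassandra.io.util.File;

final class FileIdRegistry
{
    private final ConcurrentHashMap<File, Long> ids = new ConcurrentHashMap<>();
    private final AtomicLong nextId = new AtomicLong();

    long idFor(File file)
    {
        return ids.computeIfAbsent(file, f -> nextId.incrementAndGet());
    }

    // Dropping the id makes every existing cache key for this file unmatchable;
    // the stale entries are then simply evicted over time.
    void invalidateFile(File file)
    {
        ids.remove(file);
    }

    // A cache key identifying a chunk by (fileId, position) instead of by file name.
    static final class Key
    {
        final long fileId;
        final long position;

        Key(long fileId, long position)
        {
            this.fileId = fileId;
            this.position = position;
        }

        @Override
        public boolean equals(Object o)
        {
            if (!(o instanceof Key))
                return false;
            Key k = (Key) o;
            return fileId == k.fileId && position == k.position;
        }

        @Override
        public int hashCode()
        {
            return Objects.hash(fileId, position);
        }
    }
}
```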

@@ -282,12 +282,12 @@ protected void checkKeyOrder(DecoratedKey decoratedKey)
throw new AssertionError("Last written key " + currentKey + " >= current key " + decoratedKey + " writing into " + getDataFile());
}

protected void invalidateCacheAtBoundary(FileHandle dfile)
protected void invalidateCacheAtPreviousBoundary(FileHandle dfile, long newBoundary)
{
if (lastEarlyOpenLength != 0 && dfile.dataLength() > lastEarlyOpenLength)
if (lastEarlyOpenLength != 0 && newBoundary > lastEarlyOpenLength)
dfile.invalidateIfCached(lastEarlyOpenLength);

lastEarlyOpenLength = dfile.dataLength();
lastEarlyOpenLength = newBoundary;
}

public long getFilePointer()

@@ -203,7 +203,7 @@ public boolean openEarly(Consumer<SSTableReader> callWhenReady)
dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(boundary.dataLength));
int dataBufferSize = optimizationStrategy.bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile()));
FileHandle dfile = dbuilder.bufferSize(dataBufferSize).complete(boundary.dataLength);
invalidateCacheAtBoundary(dfile);
invalidateCacheAtPreviousBoundary(dfile, boundary.dataLength);
SSTableReader sstable = BigTableReader.internalOpen(descriptor,
components(), metadata,
ifile, dfile,
@@ -246,7 +246,7 @@ protected SSTableReader openFinal(SSTableReader.OpenReason openReason)
if (compression)
dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(0));
FileHandle dfile = dbuilder.bufferSize(dataBufferSize).complete();
invalidateCacheAtBoundary(dfile);
invalidateCacheAtPreviousBoundary(dfile, Long.MAX_VALUE);
SSTableReader sstable = SSTableReader.internalOpen(descriptor,
components(),
metadata,

@@ -21,8 +21,8 @@
import java.util.function.Consumer;

import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.io.sstable.SSTable;
import org.apache.cassandra.io.tries.IncrementalTrieWriter;
import org.apache.cassandra.io.tries.Walker;
import org.apache.cassandra.io.util.FileHandle;
import org.apache.cassandra.io.util.SequentialWriter;
import org.apache.cassandra.utils.ByteBufferUtil;
@@ -112,7 +112,14 @@ private void refreshReadableBoundary()

try (FileHandle fh = fhBuilder.complete(writer.getLastFlushOffset()))
{
PartitionIndex pi = new PartitionIndexEarly(fh, partialIndexTail.root(), partialIndexTail.count(), firstKey, partialIndexLastKey, partialIndexTail.cutoff(), partialIndexTail.tail(), version);
PartitionIndex pi = new PartitionIndexEarly(fh,
partialIndexTail.root(),
partialIndexTail.count(),
SSTable.getMinimalKey(firstKey),
SSTable.getMinimalKey(partialIndexLastKey),
partialIndexTail.cutoff(),
partialIndexTail.tail(),
version);
partialIndexConsumer.accept(pi);
partialIndexConsumer = null;
}

@@ -92,4 +92,9 @@ protected void addIndexBlock() throws IOException
firstClustering = null;
++rowIndexCount;
}

long partitionStart()
{
return initialPosition;
}
}

@@ -183,17 +183,19 @@ public RowIndexEntry endPartition() throws IOException
@SuppressWarnings("resource")
public boolean openEarly(Consumer<SSTableReader> callWhenReady)
{
long dataLength = dataFile.position();
// Because the partition index writer is one partition behind, we want the file to stop at the start of the
// last partition that was written.
long dataLength = partitionWriter.partitionStart();

return iwriter.buildPartial(dataLength, partitionIndex ->
{
StatsMetadata stats = statsMetadata();
FileHandle ifile = iwriter.rowIndexFHBuilder.complete(iwriter.rowIndexFile.getLastFlushOffset());
if (compression)
dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(dataFile.getLastFlushOffset()));
dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(dataLength));
int dataBufferSize = optimizationStrategy.bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile()));
FileHandle dfile = dbuilder.bufferSize(dataBufferSize).complete(dataFile.getLastFlushOffset());
invalidateCacheAtBoundary(dfile);
FileHandle dfile = dbuilder.bufferSize(dataBufferSize).complete(dataLength);
invalidateCacheAtPreviousBoundary(dfile, dataLength);
SSTableReader sstable = TrieIndexSSTableReader.internalOpen(descriptor,
components(), metadata,
ifile, dfile, partitionIndex, iwriter.bf.sharedCopy(),
@@ -231,9 +233,9 @@ protected SSTableReader openFinal(SSTableReader.OpenReason openReason)
FileHandle rowIndexFile = iwriter.rowIndexFHBuilder.complete();
int dataBufferSize = optimizationStrategy.bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile()));
if (compression)
dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(dataFile.getLastFlushOffset()));
dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(0));
FileHandle dfile = dbuilder.bufferSize(dataBufferSize).complete();
invalidateCacheAtBoundary(dfile);
invalidateCacheAtPreviousBoundary(dfile, Long.MAX_VALUE);
SSTableReader sstable = TrieIndexSSTableReader.internalOpen(descriptor,
components(),
this.metadata,

8 changes: 5 additions & 3 deletions src/java/org/apache/cassandra/io/storage/StorageProvider.java
@@ -36,6 +36,7 @@
import org.apache.cassandra.index.sai.disk.format.IndexComponentType;
import org.apache.cassandra.io.sstable.Component;
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.sstable.SSTable;
import org.apache.cassandra.io.sstable.metadata.ZeroCopyMetadata;
import org.apache.cassandra.io.util.File;
import org.apache.cassandra.io.util.FileHandle;
@@ -256,14 +257,15 @@ public File createDirectory(String dir, DirectoryType type)
public void invalidateFileSystemCache(File file)
{
INativeLibrary.instance.trySkipCache(file, 0, 0);
if (ChunkCache.instance != null)
ChunkCache.instance.invalidateFile(file);
}

@Override
public void invalidateFileSystemCache(Descriptor desc, boolean tidied)
{
StorageProvider.instance.invalidateFileSystemCache(desc.fileFor(Component.DATA));
StorageProvider.instance.invalidateFileSystemCache(desc.fileFor(Component.ROW_INDEX));
StorageProvider.instance.invalidateFileSystemCache(desc.fileFor(Component.PARTITION_INDEX));
for (Component component : SSTable.discoverComponentsFor(desc))
[Review thread]

Reviewer: In theory I like that option, but I'll note two things:

  • discoverComponentsFor only includes components whose files still exist, but this invalidateFileSystemCache method runs after obsoletion.commit() in SSTableReader.GlobalTidy.tidy(), and the latter deletes the files, so I wonder if this couldn't be a problem in some cases (but I'm only so familiar with the whole tidying code, so maybe the obsoletion only does something in cases where nothing can be in the caches?).
  • discoverComponentsFor also has the misleading behavior of only including "hard-coded" components, meaning no "custom" ones and so none of the SAI files. To include SAI files we'd probably have to call SSTable.readTOC(desc, false), though that implies the TOC is still there, so the previous point is also a question here. I'll note that C* proper never puts SAI files into the chunk cache, and that we can override this method in CNDB, so I'm OK if we prefer to stick to hard-coded components here and leave the concern for SAI files to CNDB, but I figured it was worth mentioning.

Author (blambov): Changed the test to catch this problem.

Changed the order of cache invalidation and obsoletion to fix it. Looking at the relevant code in CC and CNDB, there doesn't appear to be anything that depends on this order.

Kept the discovery's use of discoverComponentsFor for now, because that is what the obsoletion code does. How do SAI components get deleted?

Reviewer: "How do SAI components get deleted?" Honestly, I'm not sure. StorageAttachedIndexGroup.handleNotification has a bunch of code that runs when it's notified of the removal of an sstable, but looking at it right now, I'm not finding where it actually deletes the files in the case of, say, a compacted sstable. And SSTableIndex has a comment that says it happens in LogTransaction, but I'm not sure how, given the tidier...

@jasonstack do you know off the top of your head?

(See the sketch after this file's diff for the resulting invalidate-then-obsolete ordering.)
invalidateFileSystemCache(desc.fileFor(component));
}

protected Config.DiskAccessMode accessMode(Component component)
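As a companion to the review thread above, here is a minimal sketch, not the PR's actual GlobalTidy code, of the ordering the discussion settles on: invalidate the file-system and chunk caches while the component files are still on disk (so discoverComponentsFor can see them), and only then run the obsoletion that deletes them. The class name and the Runnable parameter are hypothetical; the invalidateFileSystemCache overload is the one added in the diff above.

```java
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.storage.StorageProvider;

final class CacheAwareTidyExample
{
    static void tidy(Descriptor desc, Runnable obsoletionCommit, boolean tidied)
    {
        // 1. Invalidate first: invalidateFileSystemCache discovers components via
        //    SSTable.discoverComponentsFor(desc), which only reports files that still exist.
        StorageProvider.instance.invalidateFileSystemCache(desc, tidied);

        // 2. Only then delete the component files.
        obsoletionCommit.run();
    }
}
```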
49 changes: 16 additions & 33 deletions src/java/org/apache/cassandra/io/util/BufferManagingRebufferer.java
@@ -28,12 +28,14 @@

import org.apache.cassandra.utils.memory.BufferPools;

/**
* Buffer manager used for reading from a ChunkReader when cache is not in use. Instances of this class are
* reader-specific and thus do not need to be thread-safe since the reader itself isn't.
*
* The instances reuse themselves as the BufferHolder to avoid having to return a new object for each rebuffer call.
*/
/// Buffer manager used for reading from a [ChunkReader] when cache is not in use. They use a buffer produced by the
/// "networking" buffer pool, which is the one to be used for buffers that are not to be retained for a long time
/// (the lifetime of this object is contained by the lifetime of a [RandomAccessReader] which is contained in a read
/// operation's lifetime).
///
/// Instances of this class are reader-specific and thus do not need to be thread-safe since the reader itself isn't.
///
/// The instances reuse themselves as the BufferHolder to avoid having to return a new object for each rebuffer call.
public abstract class BufferManagingRebufferer implements Rebufferer, Rebufferer.BufferHolder
{
protected final ChunkReader source;
@@ -45,14 +47,20 @@ public abstract class BufferManagingRebufferer implements Rebufferer, Rebufferer.BufferHolder
protected BufferManagingRebufferer(ChunkReader wrapped)
{
this.source = wrapped;
buffer = BufferPools.forChunkCache().get(wrapped.chunkSize(), wrapped.preferredBufferType()).order(ByteOrder.BIG_ENDIAN);
// Note: This class uses the networking buffer pool which makes better sense for short-lifetime buffers.
// Because this is meant to be used when the chunk cache is disabled, it also makes sense to use any memory
// that may have been allocated for in-flight data by using the chunk-cache pool.
// However, if some new functionality decides to use this class in the presence of the chunk cache (e.g.
// cache-bypassing compaction), using the chunk-cache pool here will certainly cause hard-to-diagnose issues
// that we would prefer to avoid.
buffer = BufferPools.forNetworking().get(wrapped.chunkSize(), wrapped.preferredBufferType()).order(ByteOrder.BIG_ENDIAN);
pcmanus marked this conversation as resolved.
buffer.limit(0);
}

@Override
public void closeReader()
{
BufferPools.forChunkCache().put(buffer);
BufferPools.forNetworking().put(buffer);
offset = -1;
}

@@ -102,31 +110,6 @@ public ByteBuffer buffer()
return buffer.duplicate();
}

@Override
public ByteOrder order()
{
return buffer.order();
}

@Override
public FloatBuffer floatBuffer()
{
return buffer.asFloatBuffer();
}

@Override
public IntBuffer intBuffer()
{
return buffer.asIntBuffer();
}

@Override
public LongBuffer longBuffer()
{
return buffer.asLongBuffer();
}


public long offset()
{
return offset;
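To make the lifetime argument in the new class comment concrete, a small sketch of the acquire/release pattern against the networking pool that the rebufferer above uses. The BufferPools.forNetworking().get(...)/put(...) calls are the ones in the diff; the class and method names, the choice of BufferType.OFF_HEAP, and the caller-supplied chunk size are assumptions for illustration only.

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.cassandra.io.compress.BufferType;
import org.apache.cassandra.utils.memory.BufferPools;

final class ShortLivedBufferExample
{
    static void withChunkBuffer(int chunkSize)
    {
        // Acquire from the "networking" pool: intended for buffers whose lifetime is
        // bounded by a single read operation, unlike the long-lived chunk-cache pool.
        ByteBuffer buffer = BufferPools.forNetworking()
                                       .get(chunkSize, BufferType.OFF_HEAP)
                                       .order(ByteOrder.BIG_ENDIAN);
        try
        {
            buffer.limit(0); // start out empty, as BufferManagingRebufferer does
            // ... rebuffer and consume data here ...
        }
        finally
        {
            // Return the buffer as soon as the read is done.
            BufferPools.forNetworking().put(buffer);
        }
    }
}
```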
13 changes: 13 additions & 0 deletions src/java/org/apache/cassandra/io/util/ChunkReader.java
@@ -52,4 +52,17 @@ public interface ChunkReader extends RebuffererFactory
* This is not guaranteed to be fulfilled.
*/
BufferType preferredBufferType();

/**
* In some cases we may end up with both compressed and uncompressed data for the same file in
* the cache. This type is used to distinguish between them.
*/
enum ReaderType
{
SIMPLE,
COMPRESSED;
/** The number of types. Declared as a constant to avoid allocating on values(). */
public static final int COUNT = ReaderType.values().length;
}
ReaderType type();
}
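A brief illustration of why COUNT is pre-computed: ReaderType.values() allocates a fresh array on every call, so hot paths that need a per-type slot can size an array once with COUNT and index it by ordinal(). The counters class below is hypothetical, not code from this PR.

```java
import org.apache.cassandra.io.util.ChunkReader.ReaderType;

final class PerReaderTypeCounters
{
    // One slot per reader type, sized once via the pre-computed constant.
    private final long[] hits = new long[ReaderType.COUNT];

    void recordHit(ReaderType type)
    {
        hits[type.ordinal()]++;
    }

    long hitCount(ReaderType type)
    {
        return hits[type.ordinal()];
    }
}
```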
88 changes: 43 additions & 45 deletions src/java/org/apache/cassandra/io/util/CompressedChunkReader.java
@@ -30,6 +30,7 @@
import org.apache.cassandra.io.sstable.CorruptSSTableException;
import org.apache.cassandra.io.storage.StorageProvider;
import org.apache.cassandra.utils.ChecksumType;
import org.apache.cassandra.utils.memory.BufferPools;

public abstract class CompressedChunkReader extends AbstractReaderFileProxy implements ChunkReader
{
@@ -91,11 +92,14 @@ public Rebufferer instantiateRebufferer()
return new BufferManagingRebufferer.Aligned(this);
}

public ReaderType type()
{
return ReaderType.COMPRESSED;
}

public static class Standard extends CompressedChunkReader
{
// we read the raw compressed bytes into this buffer, then uncompressed them into the provided one.
private final ThreadLocalByteBufferHolder bufferHolder;

public Standard(ChannelProxy channel, CompressionMetadata metadata)
{
this(channel, metadata, 0);
@@ -104,7 +108,6 @@ public Standard(ChannelProxy channel, CompressionMetadata metadata)
public Standard(ChannelProxy channel, CompressionMetadata metadata, long startOffset)
{
super(channel, metadata, startOffset);
bufferHolder = new ThreadLocalByteBufferHolder(metadata.compressor().preferredBufferType());
}

@Override
@@ -122,57 +125,54 @@ public void readChunk(long position, ByteBuffer uncompressed)
: chunk.length;

long chunkOffset = chunk.offset - onDiskStartOffset;
if (chunk.length < maxCompressedLength)
boolean shouldDecompress = chunk.length < maxCompressedLength;
if (shouldDecompress || shouldCheckCrc) // when we need to read the CRC too, follow the decompression path to avoid a second channel read call
{
ByteBuffer compressed = bufferHolder.getBuffer(length);
ByteBuffer compressed = BufferPools.forNetworking().getAtLeast(length, metadata.compressor().preferredBufferType());

if (channel.read(compressed, chunkOffset) != length)
throw new CorruptBlockException(channel.filePath(), chunk);

compressed.flip();
compressed.limit(chunk.length);
uncompressed.clear();

if (shouldCheckCrc)
try
{
int checksum = (int) ChecksumType.CRC32.of(compressed);

compressed.limit(length);
int storedChecksum = compressed.getInt();
if (storedChecksum != checksum)
throw new CorruptBlockException(channel.filePath(), chunk, storedChecksum, checksum);
if (channel.read(compressed, chunkOffset) != length)
throw new CorruptBlockException(channel.filePath(), chunk);

compressed.position(0).limit(chunk.length);
}
if (shouldCheckCrc)
{
// compute checksum of the compressed data
compressed.position(0).limit(chunk.length);
int checksum = (int) ChecksumType.CRC32.of(compressed);
// the remaining bytes are the checksum
compressed.limit(length);
int storedChecksum = compressed.getInt();
if (storedChecksum != checksum)
throw new CorruptBlockException(channel.filePath(), chunk, storedChecksum, checksum);
}

try
{
metadata.compressor().uncompress(compressed, uncompressed);
compressed.position(0).limit(chunk.length);
uncompressed.clear();

try
{
if (shouldDecompress)
metadata.compressor().uncompress(compressed, uncompressed);
else
uncompressed.put(compressed);
}
catch (IOException e)
{
throw new CorruptBlockException(channel.filePath(), chunk, e);
}
}
catch (IOException e)
finally
{
throw new CorruptBlockException(channel.filePath(), chunk, e);
BufferPools.forNetworking().put(compressed);
}
}
else
{
uncompressed.position(0).limit(chunk.length);
if (channel.read(uncompressed, chunkOffset) != chunk.length)
throw new CorruptBlockException(channel.filePath(), chunk);

if (shouldCheckCrc)
{
uncompressed.flip();
int checksum = (int) ChecksumType.CRC32.of(uncompressed);

ByteBuffer scratch = bufferHolder.getBuffer(Integer.BYTES);

if (channel.read(scratch, chunkOffset + chunk.length) != Integer.BYTES)
throw new CorruptBlockException(channel.filePath(), chunk);
int storedChecksum = scratch.getInt(0);
if (storedChecksum != checksum)
throw new CorruptBlockException(channel.filePath(), chunk, storedChecksum, checksum);
}
}
uncompressed.flip();
}
@@ -223,24 +223,22 @@ public void readChunk(long position, ByteBuffer uncompressed)
int chunkOffsetInSegment = Ints.checkedCast(chunk.offset - segmentOffset);
ByteBuffer compressedChunk = region.buffer();

compressedChunk.position(chunkOffsetInSegment).limit(chunkOffsetInSegment + chunk.length);

uncompressed.clear();

try
{
if (shouldCheckCrc())
{
compressedChunk.position(chunkOffsetInSegment).limit(chunkOffsetInSegment + chunk.length);
int checksum = (int) ChecksumType.CRC32.of(compressedChunk);

compressedChunk.limit(compressedChunk.capacity());
int storedChecksum = compressedChunk.getInt();
if (storedChecksum != checksum)
throw new CorruptBlockException(channel.filePath(), chunk, storedChecksum, checksum);

compressedChunk.position(chunkOffsetInSegment).limit(chunkOffsetInSegment + chunk.length);
}

compressedChunk.position(chunkOffsetInSegment).limit(chunkOffsetInSegment + chunk.length);
uncompressed.clear();

if (chunk.length < maxCompressedLength)
metadata.compressor().uncompress(compressedChunk, uncompressed);
else
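For reference on the format the readers above rely on: each compressed chunk on disk is the compressed payload followed by a 4-byte CRC32 of that payload, which is what the shouldCheckCrc branches verify before decompressing. Below is a standalone sketch of that check using java.util.zip.CRC32; the real code goes through ChecksumType.CRC32 and the configured compressor, and the helper name here is made up.

```java
import java.nio.ByteBuffer;
import java.util.zip.CRC32;

final class ChunkCrcCheckExample
{
    /** Verifies the trailing CRC32 and returns a view of the compressed payload. */
    static ByteBuffer verifiedPayload(ByteBuffer chunkWithCrc, int compressedLength)
    {
        // The CRC is computed over the compressed bytes only.
        ByteBuffer payload = chunkWithCrc.duplicate();
        payload.position(0).limit(compressedLength);

        CRC32 crc = new CRC32();
        crc.update(payload.duplicate());
        int computed = (int) crc.getValue();

        // The 4 bytes immediately after the payload hold the stored checksum.
        int stored = chunkWithCrc.getInt(compressedLength);
        if (stored != computed)
            throw new IllegalStateException("Chunk checksum mismatch: " + stored + " != " + computed);

        return payload;
    }
}
```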