From da000f5f35c50bbe37ed456cc1566234d4c03066 Mon Sep 17 00:00:00 2001
From: Joel Knighton
Date: Wed, 15 Jan 2025 11:16:39 -0600
Subject: [PATCH] CNDB-12460: When refining PQVectors in CompactionGraph, use count of already encoded vectors rather than the max ordinal in the graph, as addition to the graph is asynchronous and some encoded vectors may not yet be indexed.

---
 .../cassandra/index/sai/disk/vector/CompactionGraph.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java b/src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java
index 572727232dea..4a8a65934328 100644
--- a/src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java
+++ b/src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java
@@ -311,9 +311,10 @@ public InsertionResult maybeAddVector(ByteBuffer term, int segmentRowId) throws
                 trainingVectors.clear(); // don't need these anymore so let GC reclaim if it wants to
 
                 // re-encode the vectors added so far
+                int encodedVectorCount = compressedVectors.count();
                 compressedVectors = new MutablePQVectors((ProductQuantization) compressor);
                 compactionFjp.submit(() -> {
-                    IntStream.range(0, builder.getGraph().getIdUpperBound())
+                    IntStream.range(0, encodedVectorCount)
                              // FIXME parallel is disabled until 4.0.0 beta2 (encodeAndSet is not threadsafe before then)
                              // .parallel()
                              .forEach(i -> {