Merge pull request #2081 from sanger/y24-376-pooling-changes

Y24-376 - Changes to donor pooling algorithm
sanger · Jan 3, 2025 · be6592d · be6592d
2 parents ca0f7e4 + cc8567a
commit be6592d
Show file tree

Hide file tree

Showing 5 changed files with 531 additions and 393 deletions.
diff --git a/app/models/concerns/labware_creators/donor_pooling_calculator.rb b/app/models/concerns/labware_creators/donor_pooling_calculator.rb
@@ -1,13 +1,21 @@
 # frozen_string_literal: true
 
+# rubocop:todo Metrics/ModuleLength
 # This module contains algorithms to allocate source wells into a target number of pools.
 module LabwareCreators::DonorPoolingCalculator
   extend ActiveSupport::Concern
 
-  # Splits wells into groups by study and project. Wells are grouped based on the
-  # study and project of the first aliquot in each well (only one aliquot is
-  # expected per well). Returns an array of groups, where each group is an array
-  # of wells with the same study and project.
+  VALID_POOL_SIZE_RANGE = Rails.application.config.scrna_config[:valid_pool_size_range]
+
+  # Splits wells into groups by study and project, because:
+  # a) no pool should contain samples from more than one study or project,
+  #    to limit the risk of data leakage between studies, and
+  # b) the requested number of pools is specified at the study/project level,
+  #    because different study/project groups could be from different customers.
+  #
+  # Wells are grouped based on the study and project of the first aliquot in each well
+  # (only one aliquot is expected per well). Returns an array of groups, where each group
+  # is an array of wells with the same study and project.
   #
   # If the input group is [w1, w2, w3, w4, w5, w6, w7, w8, w9]
   # where w1, w2, w3, w4, w5, w6, w7, w8, and w9 are wells with (study_id, project_id),
@@ -31,133 +39,115 @@ def split_single_group_by_study_and_project(group)
     group.group_by { |well| [well.aliquots.first.study.id, well.aliquots.first.project.id] }.values
   end
 
-  # Splits groups ensuring unique donor_ids within each group. Iterates over
-  # each group, creating subgroups with wells from a unique donor. The first
-  # occurrences of unique donor_ids are grouped, then the second occurrences,
-  # and so on. This prevents combining samples with the same donor_id. The
-  # result is flattened to a single array of subgroups.
-  #
-  # If the input groups are [[w1, w2, w3, w4], [w5, w6, w7], [w8, w9]]
-  # where w1, w2, w3, w4, w5, w6, w7, w8, and w9 are wells with (donor_id),
-  #
-  # w1(1)
-  # w2(2)
-  # w3(3)
-  # w4(1)
-  # w5(4)
-  # w6(4)
-  # w7(5)
-  # w8(6)
-  # w9(7)
-  #
-  # the result will be:
-  # [[w1, w2, w3], [w4], [w5, w7], [w6], [w8, w9]]
-  #
-  # Note that the input groups are not mixed. donor_ids are unique within each
-  # result subgroup.
-  #
-  # @param groups [Array<Array<Well>>] Array of well groups to be split.
-  # @return [Array<Array<Well>>] Array of subgroups split by donor ID.
-  def split_groups_by_unique_donor_ids(groups)
-    groups.flat_map { |group| split_single_group_by_unique_donor_ids(group) }
+  def validate_pool_sizes!(pools)
+    if pools.any? { |pool| !VALID_POOL_SIZE_RANGE.cover?(pool.size) }
+      raise 'Invalid distribution: Each pool must have ' \
+              "between #{VALID_POOL_SIZE_RANGE.min} and #{VALID_POOL_SIZE_RANGE.max} wells."
+    end
+
+    pool_sizes = pools.map(&:size)
+    return unless pool_sizes.max - pool_sizes.min > 1
+    raise 'Invalid distribution: Pool sizes differ by more than one.'
   end
 
-  # Splits a single group of wells by donor_ids. This method is used by the
-  # 'split_groups_by_unique_donor_ids' method. It iteratively segregates wells with
-  # the first encountered instance of each unique donor_id into a separate
-  # subgroup. This process continues until there are no wells left in the
-  # original group. The result is a collection of subgroups, each containing
-  # wells from distinct donors.
+  # rubocop:todo Metrics/AbcSize
+  # Allocates wells to pools. The wells will already have been grouped by study and project;
+  # here they are distributed sequentially across the pools so that no pool contains two wells
+  # with the same donor_id, ensuring that each pool has between 5 and 25 wells.
   #
-  # If the input group is [w1, w2, w3, w4, w5, w6, w7, w8, w9]
-  # where w1, w2, w3, w4, w5, w6, w7, w8, and w9 are wells with (donor_id),
+  # If the number of wells is 96 and the number of pools is 8, then
+  # each pool will have 12 wells
+  # [[12], [12], [12], [12], [12], [12], [12], [12]]
   #
-  # w1(1)
-  # w2(2)
-  # w3(3)
-  # w4(1)
-  # w5(2)
-  # w6(4)
-  # w7(5)
-  # w8(5)
-  # w9(5)
   #
-  # the result will be:
-  # [[w1, w2, w3, w6, w7], [w4, w5, w8], [w9]]
+  # If the number of wells is 96 and the number of pools is 7, then
+  # the first 5 pools will have 14 wells and the last 2 pools will have 13 wells
+  # [[14], [14], [14], [14], [14], [13], [13]]
   #
-  # @param group [Array<Well>] The group of wells to split.
-  # @return [Array<Array<Well>>] An array of subgroups, each containing wells
-  #   from different donors.
-  def split_single_group_by_unique_donor_ids(group)
-    group = group.dup
-    output = []
-    wells_moved = 0
-    wells_total = group.size
-    while wells_moved < wells_total
-      subgroup = []
-      unique_donor_ids(group).each do |donor_id|
-        wells_moved += 1
-        index = group.index { |well| well.aliquots.first.sample.sample_metadata.donor_id == donor_id }
-        subgroup << group.delete_at(index)
+  # If the number of wells is 24 and the number of pools is 5, then
+  # an error will be raised because each pool must have at least 5 wells
+  #
+  # @param wells [Array<Well>] The wells to be allocated to pools.
+  # @param number_of_pools [Integer] The number of pools to distribute the wells into.
+  # @return [Array<Array<Well>>] An array of pools, between 1 and 8, each containing between 5 and 25 wells.
+  def allocate_wells_to_pools(wells, number_of_pools)
+    pools = Array.new(number_of_pools) { [] }
+    used_donor_ids = Array.new(number_of_pools) { [] }
+
+    # Calculate ideal pool sizes based on the number of wells and pools
+    ideal_pool_size, remainder = wells.size.divmod(number_of_pools)
+
+    # If there's a remainder, some pools will have one more well than others
+    pool_sizes = Array.new(number_of_pools, ideal_pool_size)
+    remainder.times { |i| pool_sizes[i] += 1 }
+
+    wells = reorder_wells_by_donor_id(wells)
+
+    # Assign wells to pools
+    # Loop through the wells, and then the pools, and break out when we successfully assign a well
+    #
+    wells.each do |well|
+      assigned = false
+      donor_id = well.aliquots.first.sample.sample_metadata.donor_id
+
+      pools.each_with_index do |pool, pool_index|
+        # if this pool is full, try the next pool
+        next if pool.size >= pool_sizes[pool_index]
+
+        # this pool already contains a sample with this donor_id, try the next pool
+        next if used_donor_ids[pool_index].include?(donor_id)
+
+        # add the well to the pool, and skip to the next well to allocate
+        pool << well
+        used_donor_ids[pool_index] << donor_id
+        assigned = true
+        break
       end
-      output << subgroup
+
+      next if assigned
+
+      raise 'Cannot find a pool to assign the well to.'
     end
-    output
+
+    validate_pool_sizes!(pools)
+    validate_unique_donor_ids!(pools)
+    pools
   end
+  # rubocop:enable Metrics/AbcSize
 
-  # Returns the unique donor_ids from a group of wells. Used by the
-  # 'split_single_group_by_unique_donor_ids' method.
-  #
-  # If the input group is [w1, w2, w3, w4, w5, w6, w7, w8, w9]
-  # where w1, w2, w3, w4, w5, w6, w7, w8, and w9 are wells with (donor_id),
-  #
-  # w1(1)
-  # w2(2)
-  # w3(3)
-  # w4(1)
-  # w5(2)
-  # w6(4)
-  # w7(5)
-  # w8(5)
-  # w9(5)
-  #
-  # the result will be:
-  # [1, 2, 3, 4, 5]
-  #
-  # @param group [Array<Well>] The group of wells from which to retrieve donor_ids.
-  # @return [Array<String>] An array of unique donor_ids.
-  def unique_donor_ids(group)
-    group.map { |well| well.aliquots.first.sample.sample_metadata.donor_id }.uniq
+  # Reorder wells before splitting them into pools,
+  # so that the largest groups that share the same donor_id will be allocated to pools first.
+  # This prevents us from getting into a situation where the first pools fill up, and then we don't have enough pools
+  # left to split up a large group of wells that share the same donor_id.
+  # See test 'when the groups of donor ids are not ordered largest to smallest' in donor_pooling_calculator_spec.rb
+  def reorder_wells_by_donor_id(wells)
+    # { donor_id_1 => [wells], donor_id_2 => [wells], ... }
+    donor_id_to_wells = wells.group_by { |well| well.aliquots.first.sample.sample_metadata.donor_id }
+
+    # [[donor_id_1, [wells]], [donor_id_2, [wells]], ...] array of pairs, sorted by number of wells (largest first)
+    donor_id_to_wells = stable_sort_hash_by_values_size_desc(donor_id_to_wells)
+
+    # [well, well, ...] flattened back into a reordered array of wells
+    donor_id_to_wells.pluck(1).flatten
   end
 
-  # Distributes samples across pools based on group sizes. It sorts the groups
-  # by size and splits the largest group into two until the number of groups
-  # equals the number of pools or until all groups have a size of 1. The input
-  # groups are the result of applying conditions, hence they cannot be mixed.
-  #
-  # If the request number of pools is 6 and the input groups are
-  # [[1, 2, 3], [4, 5], [6, 7, 8, 9]] where the numbers denote wells,
-  #
-  # the result will be:
-  # [[3], [1], [2], [4, 5], [6, 7], [8, 9]]
+  # 'Stable sort' means the original order is maintained wherever possible.
+  # Should make pooling results more intuitive.
+  def stable_sort_hash_by_values_size_desc(the_hash)
+    the_hash.sort_by.with_index { |elem, idx| [-elem[1].size, idx] }
+  end
+
+  # Ensure that each pool contains unique donor IDs.
   #
-  # for which the steps are:
-  # [[1, 2, 3], [4, 5], [6, 7, 8, 9]] -> 3 pools (input)
-  # [[4, 5], [6, 7], [8, 9], [1, 2, 3]] -> 4 pools
-  # [[3], [4, 5], [6, 7], [8, 9], [1, 2]] -> 5 pools
-  # [[3], [1], [2], [4, 5], [6, 7], [8, 9]] -> 6 pools (output)
+  # @param pools [Array<Array<Well>>] The current pools.
   #
-  # @param groups [Array<Array<Well>>] Array of well groups to be distributed.
-  # @return [Array<Array<Well>>] Array of distributed groups.
-  def distribute_groups_across_pools(groups, number_of_pools)
-    groups = groups.dup
-    groups.sort_by!(&:size)
-    while groups.any? && groups.last.size > 1 && groups.size < number_of_pools
-      largest = groups.pop # last
-      splits = largest.each_slice((largest.size / 2.0).ceil).to_a
-      groups.concat(splits).sort_by!(&:size)
+  # @return [void]
+  def validate_unique_donor_ids!(pools)
+    pools.each_with_index do |pool, index|
+      donor_ids = pool.map { |well| well.aliquots.first.sample.sample_metadata.donor_id }
+      next unless donor_ids.uniq.size != donor_ids.size
+      raise "Pool #{index + 1} contains duplicate donor IDs: #{donor_ids.tally.select { |_id, count| count > 1 }.keys}"
     end
-    groups
   end
 
   # This method checks the pool for full allowance and adjusts the number of
@@ -305,3 +295,4 @@ def create_new_well_metadata(metadata_key, metadata_value, dest_well)
             "did not save on destination well at location #{dest_well.location}"
   end
 end
+# rubocop:enable Metrics/ModuleLength
diff --git a/app/models/labware_creators/donor_pooling_plate.rb b/app/models/labware_creators/donor_pooling_plate.rb
@@ -53,6 +53,8 @@ class DonorPoolingPlate < Base
       wells.qc_results
     ].freeze
 
+    VALID_POOL_COUNT_RANGE = Rails.application.config.scrna_config[:valid_pool_count_range]
+
     # Returns the number of source plates from the purpose configuration.
     #
     # @return [Integer] The number of source plates.
@@ -93,31 +95,6 @@ def source_wells_for_pooling
       well_filter.filtered.map(&:first) # The first element is the well.
     end
 
-    # Returns the number of samples per pool set by the submission.
-    # Assumption for now is that it will be set the same for all requests in the source plates,
-    # and stored on request_metadata, so we can fetch it from the first sample in the first well.
-    def number_of_samples_per_pool
-      @number_of_samples_per_pool ||= fetch_number_of_samples_per_pool_from_request
-    end
-
-    # Raises an error if the number of samples per pool is not found.
-    # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    def fetch_number_of_samples_per_pool_from_request
-      source_wells = source_wells_for_pooling
-      return if source_wells.blank?
-
-      number_of_samples_per_pool =
-        source_wells.first&.aliquots&.first&.request&.request_metadata&.number_of_samples_per_pool || nil
-
-      if number_of_samples_per_pool.nil?
-        raise StandardError, 'Error: request_metadata.number_of_samples_per_pool is nil'
-      end
-
-      number_of_samples_per_pool
-    end
-
-    # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-
     # Returns a hash mapping each source well to its source plate. The hash
     # contains all source wells independent of the filtering.
     #
@@ -145,15 +122,15 @@ def barcodes=(barcodes)
       @minimal_barcodes = barcodes.compact_blank.map(&:strip)
     end
 
-    # Returns the number of pools based on the sample count from the lookup
-    # table.
+    # Returns the number of pools that this group of wells should be split between, pulled from request metadata.
+    # The number of pools is specified for each group of wells that share the same Study and Project.
     #
-    # @return [Integer] The number of pools.
-    def calculated_number_of_pools
-      return if source_wells_for_pooling.blank?
-
-      # div enfoces integer division
-      source_wells_for_pooling.count.div(number_of_samples_per_pool)
+    # @param [Array<Well>] group A group of wells from the source plate(s).
+    # @return [Integer] The number of pools that they should be split into.
+    # @raise [StandardError] If any required attribute is nil.
+    def number_of_pools(group)
+      group[0]&.aliquots&.first&.request&.request_metadata&.number_of_pools ||
+        (raise 'Number of pools is missing or nil')
     end
 
     # Creates transfer requests from source wells to the destination plate in
@@ -228,14 +205,22 @@ def tag_depth_hash
     end
 
     # Builds the pools for the destination plate. The wells are first grouped
-    # by study and project, then split by donor_ids, and finally distributed
-    # across pools.
+    # by study and project, then passed along to be allocated to pools.
     #
     # @return [Array<Array<Well>>] An array of well groups distributed across pools.
     def build_pools
-      groups = split_single_group_by_study_and_project(source_wells_for_pooling)
-      groups = split_groups_by_unique_donor_ids(groups)
-      distribute_groups_across_pools(groups, calculated_number_of_pools)
+      study_project_groups = split_single_group_by_study_and_project(source_wells_for_pooling)
+
+      # allocate_wells_to_pools returns an array of pools
+      # We get one of these for every study/project group, and then 'flatten' to get a single array of pools
+      built_pools = study_project_groups.flat_map { |group| allocate_wells_to_pools(group, number_of_pools(group)) }
+
+      unless VALID_POOL_COUNT_RANGE.cover?(built_pools.size)
+        raise "Invalid requested number of pools: must be between #{VALID_POOL_COUNT_RANGE.min} " \
+                "and #{VALID_POOL_COUNT_RANGE.max}. Provided: #{built_pools.size}."
+      end
+
+      built_pools
     end
 
     # This method determines if the pools have full allowance.

diff --git a/config/initializers/scrna_config.rb b/config/initializers/scrna_config.rb
@@ -38,5 +38,9 @@
   # Default total cell count threshold when passing/failing samples
   total_cell_count_default_threshold: 50_000,
   # Key for the number of cells per chip well metadata stored on pool wells (in poly_metadata)
-  number_of_cells_per_chip_well_key: 'scrna_core_pbmc_donor_pooling_number_of_cells_per_chip_well'
+  number_of_cells_per_chip_well_key: 'scrna_core_pbmc_donor_pooling_number_of_cells_per_chip_well',
+  # Valid number of samples in a pool (inclusive range), when pooling PBMCs in the cDNA prep stage
+  valid_pool_size_range: (5..25),
+  # Valid total number of pools (inclusive range), when pooling PBMCs in the cDNA prep stage
+  valid_pool_count_range: (1..8)
 }.freeze
diff --git a/spec/factories/request_factories.rb b/spec/factories/request_factories.rb
@@ -192,7 +192,7 @@
   factory :v2_request_metadata, class: Sequencescape::Api::V2::RequestMetadata do
     skip_create
 
-    number_of_samples_per_pool { nil }
+    number_of_pools { nil }
     cells_per_chip_well { nil }
   end
 end