Merged
2 changes: 2 additions & 0 deletions CHANGELOG.adoc
@@ -45,6 +45,8 @@ In https://github.com/lyrasis/kiba-extend/pull/229[PR#229]
In https://github.com/lyrasis/kiba-extend/pull/230[PR#230]
* Ability to flexibly define `:lookup_on` keys (and method names for accessing the lookups) from with jobs.
In https://github.com/lyrasis/kiba-extend/pull/234[PR#234]
* `Deduplicate::FieldGroup` transform.
In https://github.com/lyrasis/kiba-extend/pull/237[PR#237]

=== Bugfixes

2 changes: 1 addition & 1 deletion Gemfile.lock
@@ -130,7 +130,7 @@ GEM
rspec-mocks (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.5)
rspec-support (3.13.6)
rubocop (1.80.2)
json (~> 2.3)
language_server-protocol (~> 3.17.0.2)
66 changes: 47 additions & 19 deletions lib/kiba/extend/transforms/deduplicate.rb
@@ -1,32 +1,60 @@
# frozen_string_literal: true

# rubocop:todo Layout/LineLength

module Kiba
module Extend
module Transforms
# Transformations that perform some form of data deduplication
#
# ## Choosing between similar transforms
#
# ## Scope: Entire table
#
# - {Deduplicate::Flag} - Keeps all rows. The first row with a
# duplicate value is **not** marked as a duplicate. Subsequent
# rows with the same value are marked as duplicates. Use to
# non-destructively identify what will be kept/lost in a
# deduplication process, so you can set up subsequent jobs to (a)
# report dropped non-unique rows; and (b) process the unique
# (non-duplicate) rows. Works one row at a time, so source
# size has no performance impact.
# - {Deduplicate::FlagAll} - Keeps all rows. All rows with a
# duplicate value are marked as duplicates. This is most
# helpful when you want to review all duplicate values in a
# result together, or when a subsequent step must filter out
# all values that are not unique. Holds all rows in
# memory while processing, so may be slow or even fail with
# very large source data.
# - {Deduplicate::Table} - Destructive! Removes rows. Keeps only
# the first row of any rows with the same value in the
# specified field. Holds all rows in memory while processing,
# so may be slow or even fail with very large source data.
# Equivalent to running {Deduplicate::Flag} followed by
# {FilterRows::FieldEqualTo} (to reject duplicate rows), which
# should be used instead if the size of the source data is a problem.
#
# ## Scope: Row - values in multiple fields in a field group
#
# - {Deduplicate::GroupedFieldValues} - Keeps all rows. Deduplicates
# values in **one field** in a field group. That is, the values of
# that single field are compared and deduplicated. The positions of
# removed duplicate values are used to remove the corresponding values
# in grouped fields. The actual values of the other fields in the group
# are not considered.
# - {Deduplicate::FieldGroup} - Keeps all rows. Compares and deduplicates
# entire field group. If there are 4 fields in the group, and the values
# in the first and third positions of all 4 fields are the same, the
# values in the third position are dropped from all 4 fields.
#
# ## Scope: Row - two or more non-grouped fields
#
# - {Deduplicate::Fields} - Keeps all rows. Deletes value(s) from target
# fields if the value(s) exist in the source field.
#
# ## Scope: Row - multiple values in a single field
#
# - {Deduplicate::FieldValues} - Keeps all rows. Deletes duplicate
# value(s) from a single multi-value field.
module Deduplicate
::Deduplicate = Kiba::Extend::Transforms::Deduplicate
end
end
end
end
# rubocop:enable Layout/LineLength
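The whole-table transforms described in the comment block above can be sketched in a few lines of plain Ruby. This is an illustrative sketch, not the kiba-extend API: `flag_duplicates` and its `in_field` keyword are hypothetical names, and the real `Deduplicate::Flag` transform works as a pipeline step rather than a standalone method.

```ruby
require "set"

# Illustrative sketch of the Deduplicate::Flag idea: keep every row,
# flag only the repeats. A Set of already-seen values lets this work
# one row at a time, so source size has no memory impact.
def flag_duplicates(rows, field:, in_field: :duplicate)
  seen = Set.new
  rows.map do |row|
    dup = seen.include?(row[field])
    seen << row[field]
    row.merge(in_field => dup ? "y" : "n")
  end
end

rows = [
  {id: 1, name: "Sue"},
  {id: 2, name: "Bob"},
  {id: 3, name: "Sue"}
]
flag_duplicates(rows, field: :name)
# the first "Sue" row is not flagged; the repeat is
```

Rejecting rows whose flag field equals `"y"` (the `FilterRows::FieldEqualTo` step mentioned in the doc comment) then yields the same rows as `Deduplicate::Table`, without holding the whole table in memory at once.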
189 changes: 189 additions & 0 deletions lib/kiba/extend/transforms/deduplicate/field_group.rb
@@ -0,0 +1,189 @@
# frozen_string_literal: true

module Kiba
module Extend
module Transforms
module Deduplicate
# Field value deduplication that is at least semi-safe for use with
# grouped fields that expect the same number of values for each field
# in the grouping
#
# @note Tread with caution, as this has not been used much and is not
# extensively tested.
#
# @example Basic usage/defaults
# # Used in pipeline as:
# # transform Deduplicate::FieldGroup,
# # grouped_fields: %i[name work role],
# # delim: ';'
# xform = Deduplicate::FieldGroup.new(
# grouped_fields: %i[name work role],
# delim: ';'
# )
# input = [
# # nothing in group
# {name: nil,
# work: nil,
# role: nil},
# # single group
# {name: "Sue",
# work: "Bk",
# role: "auth"},
# # nil grouped field
# {name: "Sue;Sue;Sue",
# work: nil,
# role: "auth;ed;auth"},
# # nil value in other field
# {name: "Sue;Jill;Joan;Jill",
# work: "Bk;;Bk;",
# role: "auth;auth;ed;auth"},
# # work is empty string value; role has only 2 values
# {name: "Cam;Jan;Cam",
# work: "",
# role: "auth;ed"},
# # lots of values, multiple duplicates
# {name: "Fred;Jan;Fred;Bob;Fred;Bob",
# work: "Rpt;Bk;Paper;Bk;Rpt;Bk",
# role: "auth;photog;ed;ill;auth;ed."}
# ]
# result = input.map{ |row| xform.process(row) }
# expected = [
# # nothing in group
# {name: nil,
# work: nil,
# role: nil},
# # single group
# {name: "Sue",
# work: "Bk",
# role: "auth"},
# # nil grouped field
# {name: "Sue;Sue",
# work: nil,
# role: "auth;ed"},
# # nil value in other field
# {name: "Sue;Jill;Joan",
# work: "Bk;;Bk",
# role: "auth;auth;ed"},
# # work is empty string value; role has only 2 values
# {name: "Cam;Jan;Cam",
# work: "",
# role: "auth;ed"},
# # lots of values, multiple duplicates
# {name: "Fred;Jan;Fred;Bob;Bob",
# work: "Rpt;Bk;Paper;Bk;Bk",
# role: "auth;photog;ed;ill;ed."}
# ]
# expect(result).to eq(expected)
# @example Case insensitive deduplication
# xform = Deduplicate::FieldGroup.new(
# grouped_fields: %i[name role],
# delim: ';',
# ignore_case: true
# )
# input = [
# {name: 'Jan;jan',
# role: 'auth;Auth'},
# ]
# result = input.map{ |row| xform.process(row) }
# expected = [
# {name: 'Jan',
# role: 'auth'},
# ]
# expect(result).to eq(expected)
# @example Normalized deduplication
# xform = Deduplicate::FieldGroup.new(
# grouped_fields: %i[name role],
# delim: ';',
# normalized: true
# )
# input = [
# {name: 'Jan;Jan.;Sam;Sam?;Hops',
# role: 'auth./ill.;auth, ill;ed;ed.;Ed.'},
# ]
# result = input.map{ |row| xform.process(row) }
# expected = [
# {name: 'Jan;Sam;Hops',
# role: 'auth./ill.;ed;Ed.'},
# ]
# expect(result).to eq(expected)
class FieldGroup
# @param grouped_fields [Array<Symbol>] fields in the
# multi-field grouping to be deduplicated.
# @param delim [nil, String] used to split/join multivalued field
# values
# @param ignore_case [Boolean]
# @param normalized [Boolean] if true, will apply
# {Kiba::Extend::Utils::StringNormalizer} with arguments:
# `mode: :plain, downcased: false` to values for comparison
def initialize(grouped_fields: [], delim: Kiba::Extend.delim,
ignore_case: false, normalized: false)
@fields = grouped_fields
@delim = delim
@getter = Kiba::Extend::Transforms::Helpers::FieldValueGetter.new(
fields: grouped_fields
)
@ignore_case = ignore_case
if normalized
@normalizer = Utils::StringNormalizer.new(downcased: false)
end
end

# @param row [Hash{ Symbol => String, nil }]
def process(row)
vals = getter.call(row)
return row if vals.empty?
return row if vals.values.none? { |v| v.match?(delim) }

vals.transform_values! do |v|
v.split(delim).map { |val| val.empty? ? nil : val }
end

keep = indexes_to_keep(vals)
deduplicate(vals, keep).each do |field, kept|
row[field] = kept.join(delim)
end

row
end

private

attr_reader :fields, :delim, :getter, :ignore_case, :normalizer

def indexes_to_keep(vals)
build_comparable(vals).to_a
.uniq { |arr| arr[1] }
.map { |arr| arr[0] }
.flatten
end

def deduplicate(vals, keep)
vals.clone.transform_values! do |arr|
arr.select.with_index { |v, i| keep.include?(i) }
end
end

def build_comparable(vals)
cvals = vals.dup.transform_values! { |v| v.dup }
if ignore_case
cvals.transform_values! { |vs| vs.map { |v| v.downcase } }
end
if normalizer
cvals.transform_values! do |vs|
vs.map { |v| normalizer.call(v) }
end
end

acc = {}
ct = 0
until cvals.values.all? { |arr| arr.empty? }
acc[ct] = cvals.values.map { |arr| arr.shift }
ct += 1
end
acc
end
end
end
end
end
end
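The index-keeping logic in `build_comparable` plus `indexes_to_keep` can be condensed into a standalone sketch: build one tuple per value position across all grouped fields, keep only the index of each tuple's first occurrence, then filter every field's values by those indexes. This is a simplified illustration with a hypothetical method name (`dedupe_field_group`), not the library code — it omits the `ignore_case` and `normalized` comparison options.

```ruby
# Plain-Ruby sketch of the FieldGroup idea: a position is dropped from
# every field in the group when its combined tuple of values (across
# all grouped fields) repeats an earlier position's tuple.
def dedupe_field_group(row, fields:, delim: ";")
  split = fields.to_h { |f| [f, (row[f] || "").split(delim)] }
  width = split.values.map(&:length).max || 0
  # One comparable tuple per value position, padded with nil
  tuples = (0...width).map { |i| split.values.map { |arr| arr[i] } }
  # Keep only the first occurrence of each tuple
  keep = tuples.each_index.select { |i| tuples.index(tuples[i]) == i }
  split.each do |f, arr|
    row[f] = arr.values_at(*keep).join(delim) unless arr.empty?
  end
  row
end

row = {name: "Sue;Jill;Sue", role: "auth;ed;auth"}
dedupe_field_group(row, fields: %i[name role])
# => {name: "Sue;Jill", role: "auth;ed"}
```

Because duplicate detection compares whole position tuples, `"Sue" + "auth"` at position 0 and `"Sue" + "ed"` at another position would both be kept — the distinction from `GroupedFieldValues`, which compares a single field's values only.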