|
1 | 1 | # frozen_string_literal: true |
2 | 2 |
|
3 | | -# rubocop:todo Layout/LineLength |
4 | | - |
5 | 3 | module Kiba |
6 | 4 | module Extend |
7 | 5 | module Transforms |
8 | 6 | # Transformations that do some sort of data deduplication |
9 | 7 | # |
10 | | - # ## Choosing between similar transforms |
11 | | - # |
12 | | - # - {Deduplicate::Flag} - Keeps all rows. The first row with a duplicate value is **not** marked as |
13 | | - # a duplicate. Subsequent rows with the same value are marked as duplicates. Use to non-destructively |
14 | | - # identify what will be kept/lost in a deduplication process, so you set up subsequent jobs to |
15 | | - # (a) report dropped non-unique rows; and (b) process the unique (non-duplicate) rows. Works one row |
16 | | - # at a time, so there is no performance implication due to source size. |
17 | | - # - {Deduplicate::FlagAll} - Keeps all rows. All rows with a duplicate value are marked as duplicates. |
18 | | - # This is most helpful if you are wanting to review all duplicate values in a result together, or if |
19 | | - # you need to, in a subsequent step, filter out all values that are not unique. Holds all rows in |
20 | | - # memory while processing, so may be slow or even fail with very large source data. |
21 | | - # - {Deduplicate::Table} - Destructive! Removes rows. Keeps only the first row of any rows with the same |
22 | | - # value in the specified field. Holds all rows in memory while processing, so may be slow or even fail |
23 | | - # with very large source data. Equivalent to running {Deduplicate::Flag} followed by |
24 | | - # {FilterRows::FieldEqualTo} (to reject duplicate rows), which should be used if size of source data |
25 | | - # is a problem. |
| 8 | + # ## Scope: Entire table |
| 9 | + # |
| 10 | + # - {Deduplicate::Flag} - Keeps all rows. The first row with a |
| 11 | + # duplicate value is **not** marked as a duplicate. Subsequent |
| 12 | + # rows with the same value are marked as duplicates. Use to |
| 13 | + # non-destructively identify what will be kept/lost in a |
| 14 | + # deduplication process, so you can set up subsequent jobs to (a) |
| 15 | + # report dropped non-unique rows; and (b) process the unique |
| 16 | + # (non-duplicate) rows. Works one row at a time, so there is |
| 17 | + # no performance implication due to source size. |
| 18 | + # - {Deduplicate::FlagAll} - Keeps all rows. All rows with a |
| 19 | + # duplicate value are marked as duplicates. This is most |
| 20 | + # helpful if you are wanting to review all duplicate values in |
| 21 | + # a result together, or if you need to, in a subsequent step, |
| 22 | + # filter out all values that are not unique. Holds all rows in |
| 23 | + # memory while processing, so may be slow or even fail with |
| 24 | + # very large source data. |
| 25 | + # - {Deduplicate::Table} - Destructive! Removes rows. Keeps only |
| 26 | + # the first row of any rows with the same value in the |
| 27 | + # specified field. Holds all rows in memory while processing, |
| 28 | + # so may be slow or even fail with very large source data. |
| 29 | + # Equivalent to running {Deduplicate::Flag} followed by |
| 30 | + # {FilterRows::FieldEqualTo} (to reject duplicate rows), which |
| 31 | + # should be used if size of source data is a problem. |
| 32 | + # |
| 33 | + # ## Scope: Row - values in multiple fields in a field group |
| 34 | + # |
| 35 | + # - {Deduplicate::GroupedFieldValues} - Keeps all rows. Deduplicates |
| 36 | + # values in **one field** in a field group. That is, the values of |
| 37 | + # that single field are compared and deduplicated. The positions of |
| 38 | + # removed duplicate values are used to remove the corresponding values |
| 39 | + # in grouped fields. The actual values of the other fields in the group |
| 40 | + # are not considered. |
| 41 | + # - {Deduplicate::FieldGroup} - Keeps all rows. Compares and deduplicates |
| 42 | + # entire field group. If there are 4 fields in the group, and the values |
| 43 | + # in the first and third positions of all 4 fields are the same, the |
| 44 | + # values in the third position are dropped from all 4 fields. |
| 45 | + # |
| 46 | + # ## Scope: Row - two or more non-grouped fields |
| 47 | + # |
| 48 | + # - {Deduplicate::Fields} - Keeps all rows. Deletes value(s) from target |
| 49 | + # fields if value(s) exist in source field. |
| 50 | + # |
| 51 | + # ## Scope: Row - multiple values in a single field |
| 52 | + # |
| 53 | + # - {Deduplicate::FieldValues} - Keeps all rows. Deletes duplicate |
| 54 | + # value(s) from a single multi-value field. |
26 | 55 | module Deduplicate |
27 | 56 | ::Deduplicate = Kiba::Extend::Transforms::Deduplicate |
28 | 57 | end |
29 | 58 | end |
30 | 59 | end |
31 | 60 | end |
32 | | -# rubocop:enable Layout/LineLength |
|
0 commit comments