Skip to content

Commit 405f7d6

Browse files
authored
Merge pull request #103 from benbernard/feature/comparison-gap-fixes
Fix Perl vs TypeScript comparison gaps
2 parents 2cc7420 + c52429c commit 405f7d6

37 files changed

Lines changed: 4344 additions & 373 deletions

bun.lock

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"dependencies": {
3434
"@types/react": "^19.2.14",
3535
"better-sqlite3": "^12.6.2",
36+
"chrono-node": "^2.9.0",
3637
"exceljs": "^4.4.0",
3738
"fast-xml-parser": "^5.3.7",
3839
"ink": "^6.8.0",
@@ -47,6 +48,7 @@
4748
"papaparse": "^5.5.3",
4849
"pg": "^8.18.0",
4950
"react": "^19.2.4",
50-
"string-width": "^8.2.0"
51+
"string-width": "^8.2.0",
52+
"woothee": "^1.11.1"
5153
}
5254
}

src/aggregators/Ord2Bivariate.ts

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import type { Aggregator } from "../Aggregator.ts";
2+
import type { Record } from "../Record.ts";
3+
import type { JsonValue } from "../types/json.ts";
4+
import { findKey } from "../KeySpec.ts";
5+
import { aggregatorRegistry } from "../Aggregator.ts";
6+
7+
// Running sums over the accepted (x, y) pairs:
// [count, sumX, sumY, sumXY, sumX2, sumY2]
type Ord2BivState = [number, number, number, number, number, number];

/**
 * Second-order bivariate statistics aggregator.
 *
 * Accumulates six running sums in a single pass and derives covariance,
 * correlation, and the least-squares line y = alpha + beta * x from them.
 *
 * Analogous to App::RecordStream::Aggregator::Ord2Bivariate in Perl.
 */
export class Ord2BivariateAggregator implements Aggregator<Ord2BivState | null> {
  fieldX: string;
  fieldY: string;

  /**
   * @param fieldX key spec of the x (independent) field
   * @param fieldY key spec of the y (dependent) field
   */
  constructor(fieldX: string, fieldY: string) {
    this.fieldX = fieldX;
    this.fieldY = fieldY;
  }

  /** The empty state is `null` until the first usable record arrives. */
  initial(): Ord2BivState | null {
    return null;
  }

  /**
   * Fold one record into the running sums.
   *
   * Records where either field is `undefined` or `null` are ignored and the
   * state is returned unchanged. Values are coerced with `Number()`.
   *
   * @param state sums accumulated so far, or `null` if none yet
   * @param record record to read both fields from
   * @returns a new state tuple (the input tuple is not mutated)
   */
  combine(state: Ord2BivState | null, record: Record): Ord2BivState | null {
    const vx = findKey(record.dataRef(), this.fieldX, true);
    const vy = findKey(record.dataRef(), this.fieldY, true);
    if (vx === undefined || vx === null || vy === undefined || vy === null) return state;

    const x = Number(vx);
    const y = Number(vy);
    const mapped: Ord2BivState = [1, x, y, x * y, x * x, y * y];
    if (state === null) return mapped;

    return [
      state[0] + mapped[0],
      state[1] + mapped[1],
      state[2] + mapped[2],
      state[3] + mapped[3],
      state[4] + mapped[4],
      state[5] + mapped[5],
    ];
  }

  /**
   * Turn the accumulated sums into the reported statistics.
   *
   * @param state sums produced by `combine`, or `null` if nothing was seen
   * @returns `null` for an empty state; otherwise an object with `count`,
   *   `covariance` (divides by n), and `correlation` (`null` when either
   *   field has no spread). `alpha` and `beta` are only present when x has
   *   spread, i.e. when the regression slope is defined.
   */
  squish(state: Ord2BivState | null): JsonValue {
    if (state === null) return null;
    const [n, sumX, sumY, sumXY, sumX2, sumY2] = state;

    const meanX = sumX / n;
    const meanY = sumY / n;

    // Covariance: E[XY] - E[X]*E[Y]
    const covariance = sumXY / n - meanX * meanY;

    // n^2-scaled covariance and variances. Each spread is mathematically
    // >= 0; a negative (or NaN) value can only come from floating-point
    // cancellation or non-numeric input and is treated as "no spread".
    const scaledCov = sumXY * n - sumX * sumY;
    const spreadX = sumX2 * n - sumX ** 2;
    const spreadY = sumY2 * n - sumY ** 2;
    const xVaries = spreadX > 0;
    const yVaries = spreadY > 0;

    // Correlation: cov / (stdX * stdY). Guard each spread separately: testing
    // only their product would let two negative rounding residues through.
    let correlation: number | null = null;
    if (xVaries && yVaries) {
      const scale = Math.sqrt(spreadX * spreadY);
      // The product can still underflow to 0 even when both factors are > 0.
      if (scale > 0) correlation = scaledCov / scale;
    }

    // Linear regression: y = alpha + beta * x (slope undefined without x spread)
    const beta = xVaries ? scaledCov / spreadX : null;
    const alpha = beta !== null ? (sumY - beta * sumX) / n : null;

    const result: { [key: string]: JsonValue } = {
      count: n,
      covariance,
      correlation,
    };

    if (alpha !== null && beta !== null) {
      result["alpha"] = alpha;
      result["beta"] = beta;
    }

    return result;
  }
}
87+
88+
// Make the bivariate aggregator available as "ord2biv" (alias "ord2bivariate").
// It takes exactly two arguments: the x field and the y field.
aggregatorRegistry.register("ord2biv", {
  create: (fieldX: string, fieldY: string) => {
    return new Ord2BivariateAggregator(fieldX, fieldY);
  },
  argCounts: [2],
  shortUsage: "compute second-order bivariate statistics for two fields",
  longUsage:
    "Usage: ord2biv,<field1>,<field2>\n" +
    " Compute covariance, correlation, and linear regression parameters\n" +
    " between two fields.",
  aliases: ["ord2bivariate"],
});

src/aggregators/Ord2Univariate.ts

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import type { Aggregator } from "../Aggregator.ts";
2+
import type { Record } from "../Record.ts";
3+
import type { JsonValue } from "../types/json.ts";
4+
import { findKey } from "../KeySpec.ts";
5+
import { aggregatorRegistry } from "../Aggregator.ts";
6+
7+
// Running power sums over the accepted values:
// [count, sumX, sumX2, sumX3, sumX4]
type Ord2UniState = [number, number, number, number, number];

/**
 * Second-order univariate statistics aggregator.
 *
 * Accumulates the count and the first four power sums in a single pass and
 * derives mean, variance, standard deviation, skewness, and kurtosis.
 *
 * Analogous to App::RecordStream::Aggregator::Ord2Univariate in Perl.
 */
export class Ord2UnivariateAggregator implements Aggregator<Ord2UniState | null> {
  field: string;

  /** @param field key spec of the field to summarize */
  constructor(field: string) {
    this.field = field;
  }

  /** The empty state is `null` until the first usable record arrives. */
  initial(): Ord2UniState | null {
    return null;
  }

  /**
   * Fold one record into the running sums.
   *
   * Records where the field is `undefined` or `null` are ignored and the
   * state is returned unchanged. The value is coerced with `Number()`.
   *
   * @param state sums accumulated so far, or `null` if none yet
   * @param record record to read the field from
   * @returns a new state tuple (the input tuple is not mutated)
   */
  combine(state: Ord2UniState | null, record: Record): Ord2UniState | null {
    const value = findKey(record.dataRef(), this.field, true);
    if (value === undefined || value === null) return state;

    const x = Number(value);
    const mapped: Ord2UniState = [1, x, x * x, x * x * x, x * x * x * x];
    if (state === null) return mapped;

    return [
      state[0] + mapped[0],
      state[1] + mapped[1],
      state[2] + mapped[2],
      state[3] + mapped[3],
      state[4] + mapped[4],
    ];
  }

  /**
   * Turn the accumulated sums into the reported statistics.
   *
   * @param state sums produced by `combine`, or `null` if nothing was seen
   * @returns `null` for an empty state; otherwise an object with `count`,
   *   `mean`, `variance` (divides by n), and `stddev`, plus `skewness` and
   *   `kurtosis` (m4 / variance^2, not excess) when the variance is positive.
   */
  squish(state: Ord2UniState | null): JsonValue {
    if (state === null) return null;
    const [n, sumX, sumX2, sumX3, sumX4] = state;

    const mean = sumX / n;

    // E[X^2] - mean^2 is mathematically >= 0, but cancellation on (nearly)
    // constant data can leave a tiny negative residue, which would make
    // Math.sqrt return NaN. Clamp it; Math.max still propagates NaN input.
    const variance = Math.max(0, sumX2 / n - mean * mean);
    const stddev = Math.sqrt(variance);

    const result: { [key: string]: JsonValue } = {
      count: n,
      mean,
      variance,
      stddev,
    };

    // Skewness and kurtosis require variance > 0
    if (variance > 0) {
      // E[(X - mean)^3] = E[X^3] - 3*mean*E[X^2] + 2*mean^3
      const m3 = sumX3 / n - 3 * mean * (sumX2 / n) + 2 * mean * mean * mean;
      result["skewness"] = m3 / (stddev * stddev * stddev);

      // E[(X - mean)^4] = E[X^4] - 4*mean*E[X^3] + 6*mean^2*E[X^2] - 3*mean^4
      const m4 =
        sumX4 / n -
        4 * mean * (sumX3 / n) +
        6 * mean * mean * (sumX2 / n) -
        3 * mean * mean * mean * mean;
      result["kurtosis"] = m4 / (variance * variance);
    }

    return result;
  }
}
72+
73+
// Make the univariate aggregator available as "ord2uni" (alias "ord2univariate").
// It takes exactly one argument: the field to summarize.
aggregatorRegistry.register("ord2uni", {
  create: (field: string) => {
    return new Ord2UnivariateAggregator(field);
  },
  argCounts: [1],
  shortUsage: "compute second-order univariate statistics for a field",
  longUsage:
    "Usage: ord2uni,<field>\n" +
    " Compute count, mean, variance, standard deviation, skewness,\n" +
    " and kurtosis for the specified field.",
  aliases: ["ord2univariate"],
});

src/aggregators/registry.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,7 @@ import "./FirstRecord.ts";
3333
import "./LastRecord.ts";
3434
import "./RecordForMaximum.ts";
3535
import "./RecordForMinimum.ts";
36+
import "./Ord2Univariate.ts";
37+
import "./Ord2Bivariate.ts";
3638

3739
export { aggregatorRegistry, makeAggregators } from "../Aggregator.ts";

src/clumpers/Options.ts

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -132,30 +132,38 @@ export class ClumperOptions {
132132
keyParts.push(String(val ?? ""));
133133
}
134134

135-
const groupKey = keyParts.join("\x1E");
136-
137135
if (!this.groups) {
138136
this.groups = new Map();
139137
}
140138

141-
let cookie = this.groups.get(groupKey);
142-
if (cookie === undefined) {
143-
// Handle LRU eviction if keySize is set and NOT in perfect mode
144-
if (!this.keyPerfect && this.keySize !== null && this.groups.size >= this.keySize) {
145-
const oldestKey = this.groupOrder.shift()!;
146-
const oldCookie = this.groups.get(oldestKey);
147-
if (oldCookie !== undefined) {
148-
this.callback.clumperCallbackEnd(oldCookie);
149-
this.groups.delete(oldestKey);
139+
// In cube mode, generate all 2^N combinations of actual values and "ALL"
140+
const combos = this.keyCube
141+
? this.cubeKeyValues(keySpecs, keyValues)
142+
: [{ keyValues, keyParts }];
143+
144+
for (const combo of combos) {
145+
const groupKey = combo.keyParts.join("\x1E");
146+
147+
let cookie = this.groups.get(groupKey);
148+
if (cookie === undefined) {
149+
// Handle LRU eviction if keySize is set and NOT in perfect mode
150+
if (!this.keyPerfect && this.keySize !== null && this.groups.size >= this.keySize) {
151+
const oldestKey = this.groupOrder.shift()!;
152+
const oldCookie = this.groups.get(oldestKey);
153+
if (oldCookie !== undefined) {
154+
this.callback.clumperCallbackEnd(oldCookie);
155+
this.groups.delete(oldestKey);
156+
}
150157
}
158+
159+
cookie = this.callback.clumperCallbackBegin(combo.keyValues);
160+
this.groups.set(groupKey, cookie);
161+
this.groupOrder.push(groupKey);
151162
}
152163

153-
cookie = this.callback.clumperCallbackBegin(keyValues);
154-
this.groups.set(groupKey, cookie);
155-
this.groupOrder.push(groupKey);
164+
this.callback.clumperCallbackPushRecord(cookie, record);
156165
}
157166

158-
this.callback.clumperCallbackPushRecord(cookie, record);
159167
return true;
160168
}
161169

@@ -183,6 +191,40 @@ export class ClumperOptions {
183191
}
184192
}
185193

194+
/**
 * Expand one record's key values into every cube-mode grouping.
 *
 * Each key spec is either kept at its actual value or rolled up to the
 * literal "ALL", giving 2^N combinations for N key specs. Combinations are
 * produced in bitmask order, where bit i of the mask set means keySpecs[i]
 * is rolled up; the first combination therefore has no key rolled up.
 *
 * @param keySpecs key specs, in grouping order
 * @param keyValues actual value for each key spec (a missing entry is
 *   reported as `null` in `keyValues` and as "" in `keyParts`)
 * @returns one `{ keyValues, keyParts }` pair per combination
 */
cubeKeyValues(
  keySpecs: string[],
  keyValues: { [key: string]: JsonValue }
): Array<{ keyValues: { [key: string]: JsonValue }; keyParts: string[] }> {
  const total = 1 << keySpecs.length;
  const expanded: Array<{ keyValues: { [key: string]: JsonValue }; keyParts: string[] }> = [];

  let mask = 0;
  while (mask < total) {
    const values: { [key: string]: JsonValue } = {};
    const parts: string[] = [];

    for (const [bit, spec] of keySpecs.entries()) {
      const rolledUp = (mask & (1 << bit)) !== 0;
      const actual = keyValues[spec];

      values[spec] = rolledUp ? "ALL" : (actual ?? null);
      parts.push(rolledUp ? "ALL" : String(actual ?? ""));
    }

    expanded.push({ keyValues: values, keyParts: parts });
    mask += 1;
  }

  return expanded;
}
227+
186228
getKeySize(): number | null {
187229
return this.keySize;
188230
}

src/deaggregators/registry.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
/**
2+
* Deaggregator registry - central place for registering and looking up
3+
* deaggregators by name.
4+
*
5+
* Importing this module ensures all deaggregator implementations are registered.
6+
*/
7+
8+
// Import all deaggregator implementations to trigger their self-registration
9+
import "./Split.ts";
10+
import "./Unarray.ts";
11+
import "./Unhash.ts";

0 commit comments

Comments
 (0)