Skip to content

Commit 92ed416

Browse files
authored
Merge pull request #8 from unicef/increment-3-pipeline
Merge storyboard view into main
2 parents 69c7ff5 + 94d121d commit 92ed416

12 files changed

Lines changed: 1191 additions & 182 deletions

File tree

apps/api/src/app.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { errorHandler } from "./middleware/error-handler.js"
66
import { healthRoutes } from "./routes/health.js"
77
import { createBookRoutes } from "./routes/books.js"
88
import { createPipelineRoutes } from "./routes/pipeline.js"
9+
import { createPageRoutes } from "./routes/pages.js"
910
import { createPipelineService } from "./services/pipeline-service.js"
1011
import { createPipelineRunner } from "./services/pipeline-runner.js"
1112

@@ -29,5 +30,6 @@ app.onError(errorHandler)
2930
app.route("/api", healthRoutes)
3031
app.route("/api", createBookRoutes(booksDir))
3132
app.route("/api", createPipelineRoutes(pipelineService, booksDir, promptsDir))
33+
app.route("/api", createPageRoutes(booksDir))
3234

3335
export default app

apps/api/src/routes/pages.test.ts

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import { describe, it, expect, beforeEach, afterEach } from "vitest"
2+
import fs from "node:fs"
3+
import path from "node:path"
4+
import os from "node:os"
5+
import { Hono } from "hono"
6+
import { createBookStorage } from "@adt/storage"
7+
import { errorHandler } from "../middleware/error-handler.js"
8+
import { createPageRoutes } from "./pages.js"
9+
10+
// Integration tests for the page routes. Each test runs against a throwaway
// books directory containing one book ("test-book") with two extracted pages:
// page 1 has output stored for every pipeline node, page 2 has none.
describe("Page routes", () => {
  let tmpDir: string
  let app: Hono
  const label = "test-book"

  beforeEach(() => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "pages-routes-"))

    // Create a book with extracted pages and pipeline data
    const storage = createBookStorage(label, tmpDir)
    try {
      // Simulate extracted pages
      const fakeImage = {
        imageId: `${label}_p1_page`,
        pngBuffer: Buffer.from("fake-png-data"),
        hash: "abc123",
        width: 800,
        height: 600,
      }
      storage.putExtractedPage({
        pageId: `${label}_p1`,
        pageNumber: 1,
        text: "Page one text content",
        pageImage: fakeImage,
        images: [],
      })

      const fakeImage2 = {
        imageId: `${label}_p2_page`,
        pngBuffer: Buffer.from("fake-png-data-2"),
        hash: "def456",
        width: 800,
        height: 600,
      }
      storage.putExtractedPage({
        pageId: `${label}_p2`,
        pageNumber: 2,
        text: "Page two text content",
        pageImage: fakeImage2,
        images: [],
      })

      // Simulate pipeline output for page 1
      storage.putNodeData("text-classification", `${label}_p1`, {
        reasoning: "test reasoning",
        groups: [
          {
            groupId: "g1",
            groupType: "body",
            texts: [
              { textType: "paragraph", text: "Hello world", isPruned: false },
            ],
          },
        ],
      })
      storage.putNodeData("image-classification", `${label}_p1`, {
        images: [],
      })
      storage.putNodeData("page-sectioning", `${label}_p1`, {
        reasoning: "sectioned",
        sections: [
          {
            sectionType: "content",
            partIds: ["g1"],
            backgroundColor: "#ffffff",
            textColor: "#000000",
            pageNumber: 1,
            isPruned: false,
          },
        ],
      })
      storage.putNodeData("web-rendering", `${label}_p1`, {
        sections: [
          {
            sectionIndex: 0,
            sectionType: "content",
            reasoning: "rendered",
            html: "<div>Hello world</div>",
          },
        ],
      })
    } finally {
      // Close the storage handle before the routes open the same book.
      storage.close()
    }

    const routes = createPageRoutes(tmpDir)
    app = new Hono()
    app.onError(errorHandler)
    app.route("/api", routes)
  })

  afterEach(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true })
  })

  describe("GET /api/books/:label/pages", () => {
    it("returns list of pages", async () => {
      const res = await app.request(`/api/books/${label}/pages`)

      expect(res.status).toBe(200)
      const body = await res.json()
      expect(body).toHaveLength(2)
      expect(body[0].pageId).toBe(`${label}_p1`)
      expect(body[0].pageNumber).toBe(1)
      expect(body[0].hasRendering).toBe(true)
      expect(body[1].pageId).toBe(`${label}_p2`)
      expect(body[1].pageNumber).toBe(2)
      expect(body[1].hasRendering).toBe(false)
    })

    it("returns 404 for nonexistent book", async () => {
      const res = await app.request("/api/books/no-such-book/pages")
      expect(res.status).toBe(404)
    })
  })

  describe("GET /api/books/:label/pages/:pageId", () => {
    it("returns full page data with pipeline outputs", async () => {
      const res = await app.request(
        `/api/books/${label}/pages/${label}_p1`
      )

      expect(res.status).toBe(200)
      const body = await res.json()
      expect(body.pageId).toBe(`${label}_p1`)
      expect(body.pageNumber).toBe(1)
      expect(body.text).toBe("Page one text content")
      expect(body.textClassification).toBeTruthy()
      expect(body.textClassification.groups).toHaveLength(1)
      // Guard against a misspelled response key. The matcher must be *called*;
      // a bare `.toBeFalsy` property access asserts nothing.
      expect(body.imagClassification).toBeUndefined()
      expect(body.imageClassification).toBeTruthy()
      expect(body.sectioning).toBeTruthy()
      expect(body.sectioning.sections).toHaveLength(1)
      expect(body.rendering).toBeTruthy()
      expect(body.rendering.sections[0].html).toBe(
        "<div>Hello world</div>"
      )
    })

    it("returns page without pipeline data if not processed", async () => {
      const res = await app.request(
        `/api/books/${label}/pages/${label}_p2`
      )

      expect(res.status).toBe(200)
      const body = await res.json()
      expect(body.pageId).toBe(`${label}_p2`)
      expect(body.textClassification).toBeNull()
      expect(body.rendering).toBeNull()
    })

    it("returns 404 for nonexistent page", async () => {
      const res = await app.request(
        `/api/books/${label}/pages/fake-page`
      )
      expect(res.status).toBe(404)
    })
  })

  describe("GET /api/books/:label/pages/:pageId/image", () => {
    it("returns page image as base64 JSON", async () => {
      const res = await app.request(
        `/api/books/${label}/pages/${label}_p1/image`
      )

      expect(res.status).toBe(200)
      const body = await res.json()
      expect(body.imageBase64).toBeTruthy()
      expect(typeof body.imageBase64).toBe("string")
    })

    it("returns 404 for nonexistent page image", async () => {
      const res = await app.request(
        `/api/books/${label}/pages/fake-page/image`
      )
      expect(res.status).toBe(404)
    })
  })
})

apps/api/src/routes/pages.ts

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
import fs from "node:fs"
2+
import path from "node:path"
3+
import { Hono } from "hono"
4+
import { HTTPException } from "hono/http-exception"
5+
import { parseBookLabel } from "@adt/types"
6+
import { openBookDb } from "@adt/storage"
7+
import { createBookStorage } from "@adt/storage"
8+
9+
/** One entry in the page list returned by `GET /books/:label/pages`. */
interface PageSummary {
  /** Value of the `page_id` column of the book's `pages` table. */
  pageId: string
  /** Value of the `page_number` column; the list is ordered by it. */
  pageNumber: number
  /** True when `node_data` holds a `web-rendering` row whose `item_id` is this page. */
  hasRendering: boolean
}
14+
15+
/**
 * Payload of `GET /books/:label/pages/:pageId`: the page row plus the stored
 * output of each pipeline node for that page.
 *
 * Each pipeline field holds the parsed JSON of the highest-`version`
 * `node_data` row for the page, or `null` when no such row exists.
 */
interface PageDetail {
  pageId: string
  pageNumber: number
  /** Contents of the `text` column of the page row. */
  text: string
  /** Output stored under node `text-classification`. */
  textClassification: unknown | null
  /** Output stored under node `image-classification`. */
  imageClassification: unknown | null
  /** Output stored under node `page-sectioning`. */
  sectioning: unknown | null
  /** Output stored under node `web-rendering`. */
  rendering: unknown | null
}
24+
25+
/**
 * Builds the absolute path of a book's database file,
 * `<booksDir>/<label>/<label>.db`.
 *
 * The label is passed through `parseBookLabel` first, and the parsed value is
 * what ends up in both the directory and the file name.
 */
function getDbPath(label: string, booksDir: string): string {
  const bookLabel = parseBookLabel(label)
  const booksRoot = path.resolve(booksDir)
  const dbFileName = `${bookLabel}.db`
  return path.join(booksRoot, bookLabel, dbFileName)
}
29+
30+
/** Handle type returned by `openBookDb`. */
type BookDb = ReturnType<typeof openBookDb>

/**
 * Opens the book database at `dbPath`, runs `use` against it and closes the
 * handle afterwards, whether `use` returns or throws.
 *
 * `use` must be synchronous: the handle is closed as soon as it returns.
 *
 * @throws HTTPException 404 carrying `notFoundMessage` when `dbPath` does not exist.
 */
function withBookDb<T>(
  dbPath: string,
  notFoundMessage: string,
  use: (db: BookDb) => T
): T {
  if (!fs.existsSync(dbPath)) {
    throw new HTTPException(404, { message: notFoundMessage })
  }

  const db = openBookDb(dbPath)
  try {
    return use(db)
  } finally {
    db.close()
  }
}

/**
 * Creates the read-only routes that expose a book's pages.
 *
 * - `GET /books/:label/pages`: page list with a `hasRendering` flag
 * - `GET /books/:label/pages/:pageId`: page text plus stored pipeline outputs
 * - `GET /books/:label/pages/:pageId/image`: page image as `{ imageBase64 }`
 *
 * Every handler answers 404 when the book's database file is missing.
 *
 * @param booksDir Directory that contains one sub-directory per book.
 * @returns A Hono app intended to be mounted under a prefix such as `/api`.
 */
export function createPageRoutes(booksDir: string): Hono {
  const app = new Hono()

  // GET /books/:label/pages — List pages with pipeline status
  app.get("/books/:label/pages", (c) => {
    const { label } = c.req.param()
    const safeLabel = parseBookLabel(label)
    const dbPath = getDbPath(safeLabel, booksDir)

    return withBookDb(
      dbPath,
      `Book not found or not yet extracted: ${safeLabel}`,
      (db) => {
        const pages = db.all(
          "SELECT page_id, page_number FROM pages ORDER BY page_number"
        ) as Array<{ page_id: string; page_number: number }>

        // Check which pages have web-rendering output
        const renderRows = db.all(
          "SELECT DISTINCT item_id FROM node_data WHERE node = ?",
          ["web-rendering"]
        ) as Array<{ item_id: string }>
        const rendered = new Set(renderRows.map((row) => row.item_id))

        const result: PageSummary[] = pages.map((p) => ({
          pageId: p.page_id,
          pageNumber: p.page_number,
          hasRendering: rendered.has(p.page_id),
        }))

        return c.json(result)
      }
    )
  })

  // GET /books/:label/pages/:pageId — Full page data with pipeline outputs
  app.get("/books/:label/pages/:pageId", (c) => {
    const { label, pageId } = c.req.param()
    const safeLabel = parseBookLabel(label)
    const dbPath = getDbPath(safeLabel, booksDir)

    return withBookDb(dbPath, `Book not found: ${safeLabel}`, (db) => {
      const pageRows = db.all(
        "SELECT page_id, page_number, text FROM pages WHERE page_id = ?",
        [pageId]
      ) as Array<{ page_id: string; page_number: number; text: string }>

      const [page] = pageRows
      if (page === undefined) {
        throw new HTTPException(404, {
          message: `Page not found: ${pageId}`,
        })
      }

      // Parsed JSON of the highest-version row for a node, or null when the
      // node has stored nothing for this page.
      const getNodeData = (node: string): unknown => {
        const rows = db.all(
          "SELECT data FROM node_data WHERE node = ? AND item_id = ? ORDER BY version DESC LIMIT 1",
          [node, pageId]
        ) as Array<{ data: string }>
        const [latest] = rows
        return latest === undefined ? null : JSON.parse(latest.data)
      }

      const result: PageDetail = {
        pageId: page.page_id,
        pageNumber: page.page_number,
        text: page.text,
        textClassification: getNodeData("text-classification"),
        imageClassification: getNodeData("image-classification"),
        sectioning: getNodeData("page-sectioning"),
        rendering: getNodeData("web-rendering"),
      }

      return c.json(result)
    })
  })

  // GET /books/:label/pages/:pageId/image — Page image as base64
  app.get("/books/:label/pages/:pageId/image", (c) => {
    const { label, pageId } = c.req.param()
    const safeLabel = parseBookLabel(label)
    const dbPath = getDbPath(safeLabel, booksDir)
    // The database file sits directly inside the book directory.
    const bookDir = path.dirname(dbPath)

    return withBookDb(dbPath, `Book not found: ${safeLabel}`, (db) => {
      // Look up the page image path
      const imageId = `${pageId}_page`
      const rows = db.all(
        "SELECT path FROM images WHERE image_id = ?",
        [imageId]
      ) as Array<{ path: string }>

      const [imageRow] = rows
      if (imageRow === undefined) {
        throw new HTTPException(404, {
          message: `Page image not found: ${pageId}`,
        })
      }

      const imagePath = path.resolve(bookDir, imageRow.path)
      // The stored path must resolve to something strictly inside the book
      // directory; the directory itself can never be an image.
      if (!imagePath.startsWith(bookDir + path.sep)) {
        throw new HTTPException(400, { message: "Invalid image path" })
      }

      // A row can outlive its file; answer 404 instead of letting the read
      // fail with ENOENT/EISDIR and surface as a server error.
      const stat = fs.statSync(imagePath, { throwIfNoEntry: false })
      if (stat === undefined || !stat.isFile()) {
        throw new HTTPException(404, {
          message: `Page image not found: ${pageId}`,
        })
      }

      const imageBase64 = fs.readFileSync(imagePath).toString("base64")
      return c.json({ imageBase64 })
    })
  })

  return app
}

0 commit comments

Comments
 (0)