microsoft · bryanmull-datastack · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/duodata-sync-action/.github/workflows/sync-from-google-sheets.yml b/duodata-sync-action/.github/workflows/sync-from-google-sheets.yml
@@ -0,0 +1,51 @@
+name: Sync from Google Sheets
+
+# Triggers: manual runs, a daily schedule, and pushes to main that touch the
+# sync scripts or this workflow file itself.
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 12 * * *"   # daily at 12:00 UTC
+  push:
+    branches: [main]
+    paths:
+      - "scripts/**"
+      - ".github/workflows/sync-from-google-sheets.yml"
+
+# The job commits and pushes the regenerated file (last step), so it needs
+# write access to repository contents.
+permissions:
+  contents: write
+
+# All runs share one concurrency group, and a run that is already in progress
+# is never cancelled by a newer one.
+concurrency:
+  group: sync-from-google-sheets
+  cancel-in-progress: false
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    # SHEET_ID and OUTPUT_PATH are passed to scripts/sync_sheet.py below and
+    # OUTPUT_PATH is reused by the commit step.
+    env:
+      SHEET_ID: 13BLY8046P6cPlaRV4I7IokDAYLn5NVr6rkucF8bJnPk
+      OUTPUT_PATH: duodata_semantic_view_mappings.md
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      # Runtime dependencies: sync_sheet.py imports requests and
+      # render_xlsx.py imports openpyxl.
+      - run: pip install openpyxl requests
+
+      - name: Fetch sheet and render to Markdown
+        run: |
+          python scripts/sync_sheet.py \
+            --sheet-id "$SHEET_ID" \
+            --output "$OUTPUT_PATH"
+
+      # `git status --porcelain` prints nothing when the output file is
+      # unchanged; in that case the step exits early without committing.
+      - name: Commit and push if changed
+        run: |
+          if [[ -z "$(git status --porcelain "$OUTPUT_PATH")" ]]; then
+            echo "No changes."
+            exit 0
+          fi
+          git config user.name  "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add "$OUTPUT_PATH"
+          git commit -m "Sync $OUTPUT_PATH from Google Sheets"
+          git push
diff --git a/duodata-sync-action/README.md b/duodata-sync-action/README.md
@@ -0,0 +1,46 @@
+# duodata-sync-action template
+
+Drop these files into the **root** of `datastack-cloud/duodata-semantic-view-mappings`:
+
+```
+.github/workflows/sync-from-google-sheets.yml
+scripts/sync_sheet.py
+scripts/render_xlsx.py
+```
+
+## Setup
+
+1. In the new repo, **Settings -> Actions -> General -> Workflow permissions**: select
+   **Read and write permissions** so the workflow can push the regenerated Markdown file back.
+2. Trigger once manually: **Actions -> Sync from Google Sheets -> Run workflow**.
+
+## What it does
+
+- Fetches the workbook via `https://docs.google.com/spreadsheets/d/<SHEET_ID>/export?format=xlsx`
+  (no authentication is used, so this works only for sheets shared as "Anyone with the link").
+- Renders to `duodata_semantic_view_mappings.md` using `scripts/render_xlsx.py`
+  (the structure-aware renderer: contiguous non-empty regions become separate
+  tables, single-cell header rows become `###` headings, empty columns trimmed).
+- Commits and pushes to `main` only if the file actually changed.
+
+## Configuration
+
+- `env.SHEET_ID` in the workflow holds the Google Sheets ID. Move it to
+  `vars.SHEET_ID` (repo Variables) if you'd prefer not to keep it in the YAML.
+- `env.OUTPUT_PATH` controls the output filename.
+- Schedule: `cron: "0 12 * * *"` (daily at 12:00 UTC). Adjust or delete the
+  `schedule:` block for manual-only runs.
+
+## If the sheet is later made private
+
+The export endpoint will then return an HTML page or a 403 response instead of
+the XLSX file. You'll need to switch to the Google Sheets API with a service account:
+
+1. Create a GCP service account, enable Sheets + Drive APIs, generate a JSON key.
+2. Share the sheet with the service account's email (Viewer).
+3. Store the JSON in a repo secret (e.g. `GOOGLE_SERVICE_ACCOUNT_JSON`).
+4. Replace `fetch_xlsx` in `scripts/sync_sheet.py` with a Drive `files.export`
+   call (mimeType `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`)
+   using `google-api-python-client`.
+
+This template does not include that service-account variant; ask the template's author if you need it.
diff --git a/duodata-sync-action/scripts/render_xlsx.py b/duodata-sync-action/scripts/render_xlsx.py
@@ -0,0 +1,94 @@
+"""Structure-aware XLSX -> Markdown renderer.
+
+For each sheet, splits contiguous non-empty regions into separate Markdown
+tables, promotes single-cell header rows to ### headings, trims unused
+leading/trailing columns per block, and renders single-row or single-column
+blocks as plain paragraphs instead of tables.
+"""
+import openpyxl
+
+
+def cell_to_md(value) -> str:
+    """Convert one spreadsheet cell value to text safe for a Markdown table cell.
+
+    Args:
+        value: The raw cell value; ``None`` represents an empty cell. Any
+            other value is converted with ``str()``.
+
+    Returns:
+        The stripped text with ``|`` escaped as ``\\|`` and every line break
+        (``\\n``, ``\\r\\n`` or a bare ``\\r``) replaced by ``<br>``. An empty
+        string for ``None``.
+    """
+    if value is None:
+        return ""
+    text = str(value).strip()
+    # Normalise CRLF and bare CR first: Markdown treats a lone carriage return
+    # as a line ending, so leaving one in the cell would split the table row.
+    text = text.replace("\r\n", "\n").replace("\r", "\n")
+    return text.replace("|", "\\|").replace("\n", "<br>")
+
+
+def used_col_range(rows):
+    """Return the half-open column span ``(lo, hi)`` that holds every filled cell.
+
+    A cell counts as filled when it is neither ``None`` nor ``""``. ``lo`` is
+    the smallest filled column index and ``hi`` is one past the largest; the
+    result is ``(0, 0)`` when no cell in ``rows`` is filled.
+    """
+    first, end = None, 0
+    for row in rows:
+        filled = [idx for idx, cell in enumerate(row) if cell not in (None, "")]
+        if not filled:
+            continue
+        first = filled[0] if first is None else min(first, filled[0])
+        end = max(end, filled[-1] + 1)
+    return (first or 0, end)
+
+
+def is_empty_row(row, n):
+    """Return True when none of the first ``n`` cells of ``row`` is filled.
+
+    A cell is filled when it is neither ``None`` nor ``""``.
+    """
+    for cell in row[:n]:
+        if cell not in (None, ""):
+            return False
+    return True
+
+
+def non_empty_count(row, n):
+    """Count the filled cells (neither ``None`` nor ``""``) among the first ``n`` of ``row``."""
+    filled = [cell for cell in row[:n] if cell not in (None, "")]
+    return len(filled)
+
+
+def render_table(block, n):
+    """Render ``block`` as a Markdown pipe table that is ``n`` columns wide.
+
+    The first row of ``block`` becomes the table header, followed by a
+    ``---`` separator row and one table row per remaining entry. Only the
+    first ``n`` cells of each row are used.
+    """
+
+    def as_row(cells):
+        # One pipe-delimited table line built from the first n cells.
+        return "| " + " | ".join(cell_to_md(cell) for cell in cells[:n]) + " |"
+
+    header_line = as_row(block[0])
+    separator = "| " + " | ".join(["---"] * n) + " |"
+    body_lines = [as_row(cells) for cells in block[1:]]
+    return "\n".join([header_line, separator] + body_lines)
+
+
+def render_paragraphs(block, n):
+    """Render each row of ``block`` as its own Markdown paragraph.
+
+    Every filled cell (neither ``None`` nor ``""``) among the first ``n``
+    cells of a row is included, joined by a single space, so a row with
+    several filled cells does not lose any of its values. Rows without a
+    filled cell produce no paragraph.
+
+    Args:
+        block: Sequence of rows (sequences of cell values).
+        n: Number of leading columns to consider in each row.
+
+    Returns:
+        The paragraphs separated by blank lines; ``""`` when no row has a
+        filled cell.
+    """
+    out = []
+    for row in block:
+        cells = [cell_to_md(c) for c in row[:n] if c not in (None, "")]
+        if cells:
+            # Keep all values of the row rather than only the first one.
+            out.append(" ".join(cells))
+    return "\n\n".join(out)
+
+
+def render_sheet(ws) -> str:
+    """Render one worksheet as a Markdown section.
+
+    ``ws`` must provide ``iter_rows(values_only=True)`` and a ``title``. The
+    output starts with a ``## <title>`` heading. Rows that are empty across
+    the sheet's used width separate the sheet into blocks; each block is
+    trimmed to its own used columns and rendered as paragraphs (one column
+    wide, or a single row) or as a table. A first row holding exactly one
+    filled cell, when followed by more rows, becomes a ``###`` heading.
+    """
+    grid = [list(values) for values in ws.iter_rows(values_only=True)]
+    width = used_col_range(grid)[1]
+    if width == 0:
+        return f"## {ws.title}\n\n_(empty)_"
+
+    # Group consecutive non-empty rows; empty rows only act as separators.
+    regions = [[]]
+    for values in grid:
+        if not is_empty_row(values, width):
+            regions[-1].append(values)
+        elif regions[-1]:
+            regions.append([])
+    if not regions[-1]:
+        regions.pop()
+
+    sections = [f"## {ws.title}"]
+    for region in regions:
+        start, stop = used_col_range(region)
+        span = stop - start
+        trimmed = [values[start:stop] for values in region]
+        multi_column = span > 1
+        if (
+            multi_column
+            and non_empty_count(trimmed[0], span) == 1
+            and len(trimmed) > 1
+        ):
+            # A lone filled cell above further rows is the block's heading.
+            label = next(
+                (
+                    cell_to_md(cell)
+                    for cell in trimmed[0][:span]
+                    if cell not in (None, "")
+                ),
+                "",
+            )
+            sections.append(f"### {label}")
+            trimmed = trimmed[1:]
+        if multi_column and len(trimmed) > 1:
+            sections.append(render_table(trimmed, span))
+        else:
+            sections.append(render_paragraphs(trimmed, span))
+    return "\n\n".join(sections)
+
+
+def render_workbook(source) -> str:
+    """Load the workbook at ``source`` and render all of its worksheets to Markdown.
+
+    ``source`` is passed straight to ``openpyxl.load_workbook`` with
+    ``data_only=True``. The rendered sheets are separated by blank lines and
+    the result ends with a single newline.
+    """
+    workbook = openpyxl.load_workbook(source, data_only=True)
+    rendered = [render_sheet(sheet) for sheet in workbook.worksheets]
+    return "\n\n".join(rendered) + "\n"
diff --git a/duodata-sync-action/scripts/sync_sheet.py b/duodata-sync-action/scripts/sync_sheet.py
@@ -0,0 +1,42 @@
+"""Download a public Google Sheet as XLSX and render it to Markdown."""
+import argparse
+import io
+import os
+import sys
+
+import requests
+
+# Make sibling module importable when invoked as `python scripts/sync_sheet.py`.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from render_xlsx import render_workbook  # noqa: E402
+
+
+def fetch_xlsx(sheet_id: str) -> bytes:
+    """Download a Google Sheet as XLSX through the unauthenticated export URL.
+
+    Args:
+        sheet_id: The Google Sheets document ID placed into the export URL.
+
+    Returns:
+        The raw bytes of the exported XLSX workbook.
+
+    Raises:
+        requests.HTTPError: If the final response has an error status.
+        ValueError: If the response body is not a ZIP archive, e.g. an HTML
+            page returned for a sheet that is not shared publicly.
+    """
+    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
+    # Browser-ish UA avoids occasional 403s from Google's export endpoint.
+    resp = requests.get(
+        url,
+        headers={"User-Agent": "Mozilla/5.0 (sync-from-google-sheets)"},
+        allow_redirects=True,
+        timeout=60,
+    )
+    resp.raise_for_status()
+    payload = resp.content
+    # XLSX files are ZIP archives. A successful status with any other payload
+    # means the export did not return a workbook, so fail here with a clear
+    # message instead of a confusing parse error later.
+    if not payload.startswith(b"PK\x03\x04"):
+        content_type = resp.headers.get("Content-Type", "unknown")
+        raise ValueError(
+            f"Export of sheet {sheet_id!r} did not return an XLSX file "
+            f"(Content-Type: {content_type}); is the sheet shared as "
+            "'Anyone with the link'?"
+        )
+    return payload
+
+
+def main() -> int:
+    """Command-line entry point: fetch the sheet, render it, write the Markdown file.
+
+    Requires the ``--sheet-id`` and ``--output`` arguments. The output file
+    is written as UTF-8 only after fetching and rendering have finished.
+
+    Returns:
+        ``0`` on success.
+    """
+    parser = argparse.ArgumentParser()
+    for flag in ("--sheet-id", "--output"):
+        parser.add_argument(flag, required=True)
+    opts = parser.parse_args()
+
+    workbook_stream = io.BytesIO(fetch_xlsx(opts.sheet_id))
+    rendered = render_workbook(workbook_stream)
+    with open(opts.output, "w", encoding="utf-8") as out_file:
+        out_file.write(rendered)
+    print(f"Wrote {opts.output} ({len(rendered)} chars)")
+    return 0
+
+
+if __name__ == "__main__":
+    # Use main()'s return value as the process exit status.
+    sys.exit(main())