Text extraction for macOS powered by Apple Vision — the same engine used in Finder, Quick Look, and Preview.
Supports images and PDFs across 20+ languages. Use it as a CLI tool, embed it as a Swift library, or call it from Python.
- macOS 13 or later
- Xcode 15 / Swift 5.9 or later (for building from source)
git clone https://github.com/yourusername/VisionOCR.git
cd VisionOCR
swift build -c release
sudo bash scripts/install.sh

Without sudo, install to your home directory:
bash scripts/install.sh ~/bin

If ~/bin is not in your PATH:
echo 'export PATH="$HOME/bin:$PATH"' >> ~/.zshrc
source ~/.zshrc

Verify:
ocr --version

| Type | Extensions |
|---|---|
| Images | png, jpg, jpeg, tiff, tif, bmp, gif, heic, heif, webp |
| Documents | pdf |
# Single file to stdout
ocr document.pdf
# Write to file
ocr scan.png -o scan.txt
# Process a directory recursively
ocr ./inbox -o ./results
# Non-recursive
ocr ./inbox -o ./results --no-recursive

# Plain text (default)
ocr document.pdf -o document.txt
# JSON with metadata
ocr document.pdf -f json
# JSON with per-block bounding boxes
ocr document.pdf -f json --blocks

JSON output structure:
{
"sourceFile": "/path/to/document.pdf",
"pageCount": 2,
"confidence": 0.94,
"text": "Full extracted text...",
"blocks": [
{
"page": 1,
"text": "Invoice total: 142.50 EUR",
"confidence": 0.98,
"x": 0.6,
"y": 0.87,
"width": 0.3,
"height": 0.04
}
]
}

Bounding box coordinates are normalized (0.0–1.0 relative to page dimensions).
# Single language
ocr document.pdf -l de-DE
# Multiple languages
ocr document.pdf -l de-DE,en-US,fr-FR
# Arabic
ocr document.pdf -l ar
# List all supported languages on this system
ocr --languages

# Fast mode (less accurate, significantly quicker on large batches)
ocr ./inbox -o ./results --fast
# Minimum confidence threshold (0.0-1.0)
ocr scan.png --min-confidence 0.8
# PDF rasterization DPI (default: 150, increase for better quality)
ocr scan.pdf --dpi 300

ocr invoice.pdf | grep -i "total"
ocr receipt.png | pbcopy

-o, --output <path> Output file or directory
-f, --format <fmt> Output format: txt (default), json
-l, --lang <codes> Comma-separated BCP-47 language codes
--fast Fast recognition mode
--no-recursive Do not recurse into subdirectories
--blocks Include per-block bounding boxes (json only)
--min-confidence <n> Minimum confidence 0.0-1.0 (default: 0.0)
--dpi <n> PDF rasterization DPI (default: 150)
--languages List all supported recognition languages
-v, --version Print version
-h, --help Show this help
Copy python/ocr.py next to your script. Requires ocr to be installed and on your PATH, or set the OCR_BIN environment variable.
from ocr import VisionOCR
client = VisionOCR()
# Single file — plain text
text = client.extract_text("document.pdf")
# Single file — full result with metadata
result = client.extract("scan.png")
print(result.text)
print(result.confidence)
print(result.page_count)
# Batch directory
client.extract_directory("./inbox", "./results")
# Iterate over results
for path, result in client.iter_directory("./inbox"):
print(f"{path.name}: {result.text[:80]}")
# Language selection
client = VisionOCR(languages=["ar", "en-US"])
text = client.extract_text("arabic_document.pdf")
# With bounding boxes
client = VisionOCR(include_blocks=True)
result = client.extract("invoice.pdf")
for block in result.blocks:
print(f"Page {block.page} ({block.confidence:.2f}): {block.text}")
# All options
client = VisionOCR(
languages=["de-DE", "en-US"],
fast=False,
min_confidence=0.8,
dpi=300,
include_blocks=True,
binary="/usr/local/bin/ocr", # explicit path, or set OCR_BIN env var
)

Add to your Package.swift:
.package(url: "https://github.com/yourusername/VisionOCR.git", from: "1.0.0")

Add to your target dependencies:
.product(name: "VisionOCRCore", package: "VisionOCR")

Usage:
import VisionOCRCore
// Basic
let engine = OCREngine()
let result = try engine.process(file: URL(fileURLWithPath: "document.pdf"))
print(result.text)
// With options
let options = OCROptions(
languages: ["ar", "en-US"],
recognitionLevel: .accurate,
minimumConfidence: 0.5,
includeBlocks: true,
pdfDPI: 300
)
let engine = OCREngine(options: options)
let result = try engine.process(file: url)
// Batch processing
let processor = FileProcessor(engine: engine, format: .json)
let report = processor.processDirectory(
at: inputDir,
outputDir: outputDir,
recursive: true
) { url, current, total in
print("[\(current)/\(total)] \(url.lastPathComponent)")
}
print("\(report.succeeded.count) succeeded, \(report.failed.count) failed")
// Supported languages
let languages = try OCREngine.supportedLanguages()

VisionOCR uses VNRecognizeTextRequest from Apple's Vision framework — the same engine that powers text selection in Preview, Quick Look, and the Camera app. For PDFs with an existing text layer, the native layer is extracted directly without rasterization. For scanned PDFs and images, pages are rasterized to a bitmap and passed through Vision's on-device neural text recognizer. No data leaves your machine.
MIT