import Foundation
import Vision
import AppKit

// Two positional arguments are required: the directory holding the page
// images and the path of the JSON file to produce. Index 0 is the executable.
guard CommandLine.arguments.count >= 3 else {
    fputs("Usage: ocr_pages <image_dir> <output_json>\n", stderr)
    exit(1)
}

/// Directory that is scanned for page images.
let imageDir = CommandLine.arguments[1]
/// Destination path of the JSON report.
let outputPath = CommandLine.arguments[2]
let fm = FileManager.default

/// Lower-cased file extensions that are treated as page images.
let supportedImageExtensions: Set<String> = ["png", "jpg", "jpeg"]

/// Page image URLs found directly inside `imageDir`, in page order.
///
/// File names are compared with the `.numeric` option so that unpadded names
/// sort naturally ("page_2" before "page_10"); a plain `<` comparison would
/// place "page_10" first and scramble the page numbering.
let urls: [URL]
do {
    urls = try fm.contentsOfDirectory(at: URL(fileURLWithPath: imageDir), includingPropertiesForKeys: nil)
        .filter { supportedImageExtensions.contains($0.pathExtension.lowercased()) }
        .sorted { $0.lastPathComponent.compare($1.lastPathComponent, options: .numeric) == .orderedAscending }
} catch {
    // An error escaping top-level code traps; report it and exit cleanly instead.
    fputs("Failed to list images in \(imageDir): \(error.localizedDescription)\n", stderr)
    exit(1)
}

/// One JSON-ready dictionary per processed page, appended in processing order.
var pages = [[String: Any]]()

/// Text-recognition request, configured once and shared by every page.
let request: VNRecognizeTextRequest = {
    let textRequest = VNRecognizeTextRequest()
    // Favor recognition quality over speed.
    textRequest.recognitionLevel = .accurate
    textRequest.usesLanguageCorrection = true
    // Simplified Chinese first, then US English.
    textRequest.recognitionLanguages = ["zh-Hans", "en-US"]
    // Vision documents this as a height relative to the image height.
    textRequest.minimumTextHeight = 0.005
    return textRequest
}()

// Run text recognition on every page image, in listing order, and collect one
// entry per page in `pages`. Page numbers are 1-based positions in `urls`, so
// a skipped file leaves a gap in the numbering rather than shifting later pages.
for (idx, url) in urls.enumerated() {
    // Release each page's autoreleased image objects before moving on; without
    // a pool they would pile up until the process exits.
    autoreleasepool {
        guard let nsImage = NSImage(contentsOf: url),
              let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            // Best effort: keep going, but tell the user the page was dropped.
            fputs("Skipping unreadable image: \(url.path)\n", stderr)
            return
        }

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        do {
            try handler.perform([request])
        } catch {
            // An error escaping top-level code traps; report it and exit cleanly instead.
            fputs("OCR failed for \(url.path): \(error.localizedDescription)\n", stderr)
            exit(1)
        }

        let observations = (request.results ?? []) as [VNRecognizedTextObservation]
        // Keep only the best candidate of each observation, along with its
        // bounding box exactly as reported by Vision.
        let items: [[String: Any]] = observations.compactMap { obs -> [String: Any]? in
            guard let top = obs.topCandidates(1).first else { return nil }
            let bb = obs.boundingBox
            return [
                "text": top.string,
                "confidence": top.confidence,
                "x": bb.origin.x,
                "y": bb.origin.y,
                "w": bb.size.width,
                "h": bb.size.height
            ]
        }

        pages.append([
            "page": idx + 1,
            "file": url.path,
            "width": cgImage.width,
            "height": cgImage.height,
            "items": items
        ])
        print("OCR page \(idx+1)/\(urls.count): \(items.count) items")
    }
}
/// Serialized report: a top-level object with a single "pages" array,
/// pretty-printed with sorted keys so the output is stable across runs.
let data: Data
do {
    data = try JSONSerialization.data(withJSONObject: ["pages": pages], options: [.prettyPrinted, .sortedKeys])
    try data.write(to: URL(fileURLWithPath: outputPath))
} catch {
    // An error escaping top-level code traps; report it and exit cleanly instead.
    fputs("Failed to write \(outputPath): \(error.localizedDescription)\n", stderr)
    exit(1)
}
print("Wrote \(outputPath)")