| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- #!/usr/bin/env python3
- """
- Word (.docx) -> HTML 自动转换器
- 监听 /watch 目录,docx 文件上传/修改后自动转成 HTML 写入 /output
- 说明:.doc(97-2003) 不支持,建议先另存为 .docx
- """
- import html
- import logging
- import time
- from pathlib import Path
- import mammoth
- from watchdog.events import FileSystemEventHandler
- from watchdog.observers.polling import PollingObserver
# Root-logger setup: INFO level, timestamped lines tagged "[word-converter]".
_LOG_FORMAT = "%(asctime)s [word-converter] %(levelname)s %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
log = logging.getLogger(__name__)

# Directory that is scanned and watched for Word documents.
WATCH_DIR = Path("/watch")

# Directory that receives the generated HTML pages. It is created here
# (including missing parents) so later writes can assume it exists.
OUTPUT_DIR = Path("/output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def convert_docx(src: Path, dst: Path) -> None:
    """Convert the .docx file *src* into a standalone HTML page at *dst*.

    The document is run through ``mammoth.convert_to_html`` and the resulting
    HTML fragment is wrapped in a minimal page: an inline stylesheet, an
    ``<h1>`` holding the HTML-escaped source file name, and -- when the
    converter reported any messages -- a collapsible ``<details>`` element
    listing them (escaped, one per line) ahead of the document content.
    The page is written to *dst* as UTF-8.
    """
    with src.open("rb") as docx_file:
        conversion = mammoth.convert_to_html(docx_file)

    # Collapsible list of converter messages; stays empty when there are none.
    notices = ""
    if conversion.messages:
        escaped = [html.escape(str(message)) for message in conversion.messages]
        notices = "".join(
            (
                "<details><summary>转换提示</summary><pre>",
                "\n".join(escaped),
                "</pre></details>",
            )
        )

    # Plain (non-f) strings: the braces below are literal CSS, not fields.
    stylesheet = (
        "body{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica,Arial,sans-serif;"
        "max-width:980px;margin:24px auto;padding:0 16px;line-height:1.7;color:#1f2937;}"
        "img{max-width:100%;height:auto;}"
        "table{border-collapse:collapse;width:100%;margin:12px 0;}"
        "th,td{border:1px solid #d1d5db;padding:6px 10px;text-align:left;}"
        "pre{background:#f6f8fa;padding:12px;overflow:auto;}"
        "code{background:#f6f8fa;padding:2px 4px;}"
        "details{margin-top:20px;color:#6b7280;}"
    )
    heading = "<h1>" + html.escape(src.name) + "</h1>"

    parts = [
        "<!doctype html><html><head><meta charset='utf-8'>",
        "<meta name='viewport' content='width=device-width, initial-scale=1'>",
        "<style>",
        stylesheet,
        "</style></head><body>",
        heading,
        notices,
        conversion.value,
        "</body></html>",
    ]
    dst.write_text("".join(parts), encoding="utf-8")
def convert_file(path: Path) -> None:
    """Convert *path* to HTML under ``OUTPUT_DIR`` if it is a .docx document.

    The decision is made from the file name alone; the suffix is compared
    case-insensitively:

    * names starting with ``~$`` are skipped silently -- Word keeps such
      owner/lock files next to a document while it is open, and they are
      not convertible documents;
    * ``.doc`` files are skipped with a warning (format not supported);
    * files with any other non-``.docx`` suffix are ignored;
    * ``.docx`` files are converted to ``OUTPUT_DIR/<stem>.html``.

    Exceptions raised by the conversion are logged with their traceback
    instead of being propagated to the caller.
    """
    # Checked first so lock files of either format produce no log noise.
    if path.name.startswith("~$"):
        return

    suffix = path.suffix.lower()
    if suffix == ".doc":
        log.warning("Skip .doc (unsupported): %s", path.name)
        return
    if suffix != ".docx":
        return

    # NOTE: the output name is built from the stem only, so documents that
    # share a stem but live in different directories map to the same file.
    out = OUTPUT_DIR / (path.stem + ".html")
    try:
        convert_docx(path, out)
    except Exception as e:
        # log.exception keeps the traceback, which log.error would drop.
        log.exception("Failed to convert %s: %s", path, e)
    else:
        log.info("Converted: %s -> %s", path.name, out.name)
class WordHandler(FileSystemEventHandler):
    """Forward file events from the watched tree to ``convert_file``.

    Directory events are ignored. ``convert_file`` decides from the file
    name whether anything is actually converted, so every file event is
    passed through unfiltered.
    """

    def on_created(self, event):
        """Convert a file that has just appeared."""
        if not event.is_directory:
            convert_file(Path(event.src_path))

    def on_modified(self, event):
        """Re-convert a file whose content changed."""
        if not event.is_directory:
            convert_file(Path(event.src_path))

    def on_moved(self, event):
        """Convert a file that was renamed or moved to a new path.

        A document that reaches its final name through a rename (for
        example an upload written under a temporary name first) is reported
        as a move rather than a creation, so the destination path has to be
        handled here or the document would never be converted.
        """
        if not event.is_directory:
            convert_file(Path(event.dest_path))
if __name__ == "__main__":
    log.info("Word converter started. Watching: %s", WATCH_DIR)

    # Initial pass over files that already exist. Every regular file is
    # handed to convert_file, which filters on the lower-cased suffix; a
    # "*.docx" glob would be case-sensitive on POSIX and miss names such as
    # REPORT.DOCX that the event handler does convert.
    for f in WATCH_DIR.rglob("*"):
        if f.is_file():
            convert_file(f)

    # Poll the tree (including subdirectories) for subsequent changes.
    observer = PollingObserver(timeout=2)
    observer.schedule(WordHandler(), str(WATCH_DIR), recursive=True)
    observer.start()
    try:
        # The observer works in its own thread; keep the main thread alive.
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop; fall through to the cleanup.
        pass
    finally:
        # Stop and wait for the observer thread however the loop ended.
        observer.stop()
        observer.join()
|