#!/usr/bin/env python3
"""Automatic Word (.docx) -> HTML converter.

Watches the /watch directory; whenever a .docx file is uploaded or modified it
is converted to HTML and written to /output.

Note: legacy .doc (Word 97-2003) files are not supported; re-save them as
.docx first.
"""
import html
import logging
import time
from pathlib import Path

import mammoth
from watchdog.events import FileSystemEventHandler
from watchdog.observers.polling import PollingObserver

WATCH_DIR = Path("/watch")
OUTPUT_DIR = Path("/output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [word-converter] %(levelname)s %(message)s",
)
log = logging.getLogger(__name__)

# Microsoft Word keeps an owner/lock file named "~$<document>.docx" next to a
# document while it is open. It carries the .docx suffix but is not a real
# document, so trying to convert it can only fail.
_LOCK_FILE_PREFIX = "~$"


def _render_page(title: str, body: str, messages: list) -> str:
    """Wrap converted *body* markup in a minimal standalone HTML page.

    Args:
        title: Plain text used for the page ``<title>``; it is HTML-escaped
            here.
        body: HTML fragment to embed as-is (already markup, not escaped).
        messages: Conversion messages; each one is stringified, escaped and
            listed in a collapsible block placed before the body. An empty
            list produces no block at all.

    Returns:
        The complete HTML document as a string.
    """
    notes = ""
    if messages:
        escaped = "\n".join(html.escape(str(m)) for m in messages)
        notes = (
            "<details><summary>转换提示</summary><pre>"
            + escaped
            + "</pre></details>"
        )
    return (
        "<!DOCTYPE html>"
        "<html>"
        "<head>"
        '<meta charset="utf-8">'
        f"<title>{html.escape(title)}</title>"
        "</head>"
        "<body>"
        f"{notes}{body}"
        "</body>"
        "</html>"
    )


def convert_docx(src: Path, dst: Path) -> None:
    """Convert the .docx file at *src* to an HTML page written to *dst*.

    The page title is the source file name, and any messages mammoth reports
    for the conversion are included at the top of the page.

    Raises:
        OSError: If *src* cannot be read or *dst* cannot be written.
        Exception: Whatever mammoth raises for an unreadable document.
    """
    with src.open("rb") as fd:
        result = mammoth.convert_to_html(fd)
    page = _render_page(src.name, result.value, result.messages)
    dst.write_text(page, encoding="utf-8")


def convert_file(path: Path) -> None:
    """Convert *path* if it is a .docx file; ignore everything else.

    .doc files are skipped with a warning, Word lock files and all other
    suffixes are skipped silently. Conversion errors are logged rather than
    raised so that one bad file cannot stop the watcher.

    NOTE: the output name is built from the file stem only, so two documents
    with the same name in different sub-directories of the watch tree write
    to the same output file.
    """
    if path.name.startswith(_LOCK_FILE_PREFIX):
        log.debug("Skip Word lock file: %s", path.name)
        return

    suffix = path.suffix.lower()
    if suffix == ".doc":
        log.warning("Skip .doc (unsupported): %s", path.name)
        return
    if suffix != ".docx":
        return

    out = OUTPUT_DIR / (path.stem + ".html")
    try:
        convert_docx(path, out)
    except Exception as e:  # top-level boundary: log and keep watching
        # A failure here is often transient (e.g. the file is still being
        # written); a later modification event triggers another attempt.
        log.error("Failed to convert %s: %s", path, e)
    else:
        log.info("Converted: %s -> %s", path.name, out.name)


class WordHandler(FileSystemEventHandler):
    """Watchdog handler that converts files as they appear or change."""

    @staticmethod
    def _handle(event, raw_path) -> None:
        """Run the converter on *raw_path* unless the event is for a directory."""
        if not event.is_directory:
            convert_file(Path(raw_path))

    def on_created(self, event):
        """Convert a newly created file."""
        self._handle(event, event.src_path)

    def on_modified(self, event):
        """Re-convert a file whose content changed."""
        self._handle(event, event.src_path)

    def on_moved(self, event):
        """Convert a file that was renamed or moved within the watched tree.

        Without this, a file that only becomes a .docx through a rename would
        never be converted.
        """
        self._handle(event, event.dest_path)


def main() -> None:
    """Convert the files already present, then watch for changes until stopped."""
    log.info("Word converter started. Watching: %s", WATCH_DIR)

    # One pass over every file: convert_file() does the (case-insensitive)
    # suffix filtering, so ".DOCX" is treated the same here as it is for
    # watcher events.
    for candidate in WATCH_DIR.rglob("*"):
        if candidate.is_file():
            convert_file(candidate)

    observer = PollingObserver(timeout=2)
    observer.schedule(WordHandler(), str(WATCH_DIR), recursive=True)
    observer.start()
    try:
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        pass
    finally:
        # Always shut the observer thread down, whatever ended the loop.
        observer.stop()
        observer.join()


if __name__ == "__main__":
    main()