convert_word.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. #!/usr/bin/env python3
  2. """
  3. Word (.docx) -> HTML 自动转换器
  4. 监听 /watch 目录,docx 文件上传/修改后自动转成 HTML 写入 /output
  5. 说明:.doc(97-2003) 不支持,建议先另存为 .docx
  6. """
  7. import html
  8. import logging
  9. import time
  10. from pathlib import Path
  11. import mammoth
  12. from watchdog.events import FileSystemEventHandler
  13. from watchdog.observers.polling import PollingObserver
  14. WATCH_DIR = Path("/watch")
  15. OUTPUT_DIR = Path("/output")
  16. OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
  17. logging.basicConfig(
  18. level=logging.INFO,
  19. format="%(asctime)s [word-converter] %(levelname)s %(message)s",
  20. )
  21. log = logging.getLogger(__name__)
  22. def convert_docx(src: Path, dst: Path) -> None:
  23. with src.open("rb") as fd:
  24. result = mammoth.convert_to_html(fd)
  25. body = result.value
  26. warn_lines = ""
  27. if result.messages:
  28. warn_lines = "\n".join(html.escape(str(m)) for m in result.messages)
  29. warn_lines = (
  30. "<details><summary>转换提示</summary><pre>"
  31. + warn_lines
  32. + "</pre></details>"
  33. )
  34. page = (
  35. "<!doctype html><html><head><meta charset='utf-8'>"
  36. "<meta name='viewport' content='width=device-width, initial-scale=1'>"
  37. "<style>"
  38. "body{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica,Arial,sans-serif;"
  39. "max-width:980px;margin:24px auto;padding:0 16px;line-height:1.7;color:#1f2937;}"
  40. "img{max-width:100%;height:auto;}"
  41. "table{border-collapse:collapse;width:100%;margin:12px 0;}"
  42. "th,td{border:1px solid #d1d5db;padding:6px 10px;text-align:left;}"
  43. "pre{background:#f6f8fa;padding:12px;overflow:auto;}"
  44. "code{background:#f6f8fa;padding:2px 4px;}"
  45. "details{margin-top:20px;color:#6b7280;}"
  46. "</style></head><body>"
  47. f"<h1>{html.escape(src.name)}</h1>"
  48. f"{warn_lines}{body}</body></html>"
  49. )
  50. dst.write_text(page, encoding="utf-8")
  51. def convert_file(path: Path) -> None:
  52. suffix = path.suffix.lower()
  53. if suffix == ".doc":
  54. log.warning("Skip .doc (unsupported): %s", path.name)
  55. return
  56. if suffix != ".docx":
  57. return
  58. out = OUTPUT_DIR / (path.stem + ".html")
  59. try:
  60. convert_docx(path, out)
  61. log.info("Converted: %s -> %s", path.name, out.name)
  62. except Exception as e:
  63. log.error("Failed to convert %s: %s", path, e)
  64. class WordHandler(FileSystemEventHandler):
  65. def on_created(self, event):
  66. if not event.is_directory:
  67. convert_file(Path(event.src_path))
  68. def on_modified(self, event):
  69. if not event.is_directory:
  70. convert_file(Path(event.src_path))
  71. if __name__ == "__main__":
  72. log.info("Word converter started. Watching: %s", WATCH_DIR)
  73. for f in WATCH_DIR.rglob("*.docx"):
  74. convert_file(f)
  75. for f in WATCH_DIR.rglob("*.doc"):
  76. convert_file(f)
  77. observer = PollingObserver(timeout=2)
  78. observer.schedule(WordHandler(), str(WATCH_DIR), recursive=True)
  79. observer.start()
  80. try:
  81. while True:
  82. time.sleep(5)
  83. except KeyboardInterrupt:
  84. observer.stop()
  85. observer.join()