#!/usr/bin/env python3
"""
Word (.docx) -> HTML 自动转换器
监听 /watch 目录,docx 文件上传/修改后自动转成 HTML 写入 /output
说明:.doc(97-2003) 不支持,建议先另存为 .docx
"""
import html
import logging
import time
from pathlib import Path
import mammoth
from watchdog.events import FileSystemEventHandler
from watchdog.observers.polling import PollingObserver
# Directory that is scanned at startup and then watched for Word files.
WATCH_DIR = Path("/watch")
# Directory that receives the generated .html files.
OUTPUT_DIR = Path("/output")
# Create the output directory (and any missing parents) as soon as the module
# is loaded; an already existing directory is not an error.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Log INFO and above; every record is tagged "[word-converter]".
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [word-converter] %(levelname)s %(message)s",
)
# Module-level logger used by the functions in this file.
log = logging.getLogger(__name__)
def convert_docx(src: Path, dst: Path) -> None:
    """Convert the .docx file at *src* into a standalone HTML page at *dst*.

    The document body comes from ``mammoth.convert_to_html``.  Any messages
    mammoth reports are HTML-escaped and shown in a collapsible notes section
    placed ahead of the converted body.

    Args:
        src: Path of the .docx file to read; its file name is used as the
            page title.
        dst: Path of the HTML file to write, encoded as UTF-8.

    Raises:
        OSError: If *src* cannot be opened or *dst* cannot be written.
        Exception: Any error mammoth raises while parsing the document is
            propagated unchanged.
    """
    with src.open("rb") as fd:
        result = mammoth.convert_to_html(fd)

    title = html.escape(src.name)
    body = result.value

    warn_block = ""
    if result.messages:
        # Messages are plain text from the converter; escape them so they can
        # never be interpreted as markup.
        escaped = "\n".join(html.escape(str(m)) for m in result.messages)
        warn_block = (
            "<details><summary>转换提示</summary>\n"
            f"<pre>{escaped}</pre></details>\n"
        )

    # NOTE(review): the template literals in the reviewed copy of this file
    # were unterminated and had lost their markup, so the page skeleton below
    # is a reconstruction (HTML5 doctype, UTF-8 charset, file name as title)
    # -- confirm it matches the intended layout.
    page = (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="utf-8">\n'
        f"<title>{title}</title>\n"
        "</head>\n"
        "<body>\n"
        f"{warn_block}{body}\n"
        "</body>\n"
        "</html>\n"
    )
    dst.write_text(page, encoding="utf-8")
def convert_file(path: Path) -> None:
    """Convert *path* to HTML inside ``OUTPUT_DIR`` if it is a .docx file.

    Files are selected by their lower-cased suffix:

    * ``.docx`` files are converted to ``OUTPUT_DIR/<stem>.html``.
    * ``.doc`` files are not supported; a warning is logged and they are
      skipped.
    * Office lock files (names starting with ``~$``) and every other file
      type are ignored silently.

    Conversion errors are logged and swallowed so that one bad document does
    not stop the caller from processing further files.

    Args:
        path: Path of the candidate file.
    """
    # Word keeps an owner/lock file named "~$<name>" next to a document while
    # it is open.  It carries the document's suffix but is not a real
    # document, so converting it would only produce an error log entry.
    if path.name.startswith("~$"):
        return

    suffix = path.suffix.lower()
    if suffix == ".doc":
        log.warning("Skip .doc (unsupported): %s", path.name)
        return
    if suffix != ".docx":
        return

    # NOTE: only the stem is kept, so documents with the same file name in
    # different sub-directories are written to the same output file.
    out = OUTPUT_DIR / (path.stem + ".html")
    try:
        convert_docx(path, out)
    except Exception as e:
        # Boundary handler: report the failure and carry on with other files.
        log.error("Failed to convert %s: %s", path, e)
    else:
        log.info("Converted: %s -> %s", path.name, out.name)
class WordHandler(FileSystemEventHandler):
    """Watchdog event handler that passes changed files to ``convert_file``.

    Directory events are ignored.  Filtering by file type is left to
    ``convert_file``.
    """

    def on_created(self, event):
        """Convert a file that has just appeared."""
        if not event.is_directory:
            convert_file(Path(event.src_path))

    def on_modified(self, event):
        """Re-convert a file whose contents changed."""
        if not event.is_directory:
            convert_file(Path(event.src_path))

    def on_moved(self, event):
        """Convert a file that was renamed or moved to a new path.

        Tools that save or upload through a temporary file and then rename it
        into place produce a move event rather than a create/modify event, so
        the destination path has to be handled here as well.
        """
        if not event.is_directory:
            convert_file(Path(event.dest_path))
def main() -> None:
    """Convert the existing files in ``WATCH_DIR``, then watch it for changes.

    Runs until interrupted with Ctrl-C.  The observer is stopped and joined
    on every exit path.
    """
    log.info("Word converter started. Watching: %s", WATCH_DIR)

    # Single pass over the tree.  convert_file() decides by lower-cased
    # suffix, so files such as "REPORT.DOCX" are picked up here exactly as
    # they are by the event handler (a "*.docx" glob is case-sensitive on
    # most file systems and would miss them).
    for f in WATCH_DIR.rglob("*"):
        if f.is_file():
            convert_file(f)

    observer = PollingObserver(timeout=2)
    observer.schedule(WordHandler(), str(WATCH_DIR), recursive=True)
    observer.start()
    try:
        # The observer works in its own thread; just keep the process alive.
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        pass
    finally:
        # Shut the watcher thread down regardless of how the loop ended.
        observer.stop()
        observer.join()


if __name__ == "__main__":
    main()