"""Image handling module.

Turns ``<img>`` tags found in Markdown into standalone image paragraphs and
inserts each picture, followed by a centered caption, into a Word document.
"""

import re
import struct

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH

# NOTE(review): the copy of this file that was reviewed had every ``<...>``
# span stripped out. The ``<img`` prefix of ``_IMG_TAG`` and the GIF / BMP
# branches of ``_get_image_dimensions`` were restored from the surrounding
# code and docstrings -- compare them against version control.

# Matches an <img ... src="..." ...> tag; group(2) is the src value.
_IMG_TAG = re.compile(
    r'<img([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>', re.IGNORECASE
)

# Extracts name="value" pairs from a tag's attribute text.
_ATTR = re.compile(r'(\w+)\s*=\s*["\']([^"\']*)["\']')

# Matches a duplicated bold caption such as "**图1 ...**" or "**Figure 2 ...**".
_FIG_CAPTION = re.compile(
    r'^\*\*(?:图|Table|Figure|Fig\.?)\s*\d+.*\*\*$', re.IGNORECASE
)

# JPEG start-of-frame markers: 0xC0-0xCF except DHT (C4), JPG (C8), DAC (CC).
_JPEG_SOF_MARKERS = frozenset(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}

# JPEG markers that carry no length field: TEM, RST0-RST7 and SOI.
_JPEG_STANDALONE_MARKERS = frozenset({0x01, 0xD8, *range(0xD0, 0xD8)})

# EMU per pixel at 72 DPI (914400 EMU per inch / 72).
_EMU_PER_PX = 12700


def make_image_paragraph(block: str) -> dict | None:
    """Return an image-paragraph dict if *block* contains an ``<img>`` tag.

    Only the first ``<img>`` tag in *block* is considered. Attributes are read
    from that tag alone, so quoted ``name="value"`` text elsewhere in the block
    cannot leak into the result. Attribute names are matched case-insensitively.

    Returns
    -------
    dict | None
        ``{"type": "image", "src": ..., "alt": ...}`` (``alt`` defaults to
        ``""``), or ``None`` when no ``<img>`` tag is present.
    """
    m = _IMG_TAG.search(block)
    if not m:
        return None
    attrs = {name.lower(): value for name, value in _ATTR.findall(m.group(0))}
    return {
        "type": "image",
        "src": attrs.get("src", m.group(2)),
        "alt": attrs.get("alt", ""),
    }


def is_figure_caption(block: str) -> bool:
    """Return True if *block* is a bold ``**图X ...**``-style caption line."""
    return bool(_FIG_CAPTION.match(block.strip()))


def _scan_jpeg_dimensions(f) -> tuple[int, int] | None:
    """Walk the JPEG segments of binary file *f* and return ``(width, height)``.

    Returns ``None`` if the stream ends, is malformed, or reaches the scan
    data (SOS) or EOI before any start-of-frame segment.
    """
    f.seek(2)  # skip the SOI marker (FF D8)
    while True:
        if f.read(1) != b"\xff":
            return None  # EOF or not positioned on a marker
        marker = f.read(1)
        while marker == b"\xff":  # fill bytes may precede the marker code
            marker = f.read(1)
        if not marker:
            return None
        code = marker[0]
        if code in _JPEG_STANDALONE_MARKERS:
            continue
        if code in (0xD9, 0xDA):  # EOI / SOS: no frame header will follow
            return None
        raw_len = f.read(2)
        if len(raw_len) < 2:
            return None
        (seg_len,) = struct.unpack(">H", raw_len)
        if seg_len < 2:  # the length field counts its own two bytes
            return None
        if code in _JPEG_SOF_MARKERS:
            # SOF payload: precision (1 byte), height (2), width (2), ...
            frame = f.read(5)
            if len(frame) < 5:
                return None
            height, width = struct.unpack_from(">HH", frame, 1)
            return width, height
        f.seek(seg_len - 2, 1)  # jump over this segment's payload


def _get_image_dimensions(image_path: str) -> tuple[int, int] | None:
    """Return ``(width_px, height_px)`` read from the image file's header.

    Uses only the standard library. Supports PNG, JPEG, GIF and BMP. Returns
    ``None`` for unreadable files, truncated headers and unsupported formats;
    it never raises for malformed image data.
    """
    try:
        with open(image_path, "rb") as f:
            header = f.read(32)
            # JPEG keeps its size in a SOF segment that is normally far beyond
            # the first 32 bytes, so it has to be located by walking the file.
            if header[:2] == b"\xff\xd8":
                return _scan_jpeg_dimensions(f)
    except (OSError, TypeError, ValueError):
        return None

    # PNG: 8-byte signature, then the IHDR chunk (width/height at offset 16).
    if header[:8] == b"\x89PNG\r\n\x1a\n":
        if len(header) < 24:
            return None
        w, h = struct.unpack_from(">II", header, 16)
        return w, h

    # GIF: "GIF87a" or "GIF89a", then little-endian 16-bit width and height.
    if header[:6] in (b"GIF87a", b"GIF89a"):
        if len(header) < 10:
            return None
        w, h = struct.unpack_from("<HH", header, 6)
        return w, h

    # BMP: "BM", 14-byte file header, then the DIB header.
    if header[:2] == b"BM" and len(header) >= 26:
        (dib_size,) = struct.unpack_from("<I", header, 14)
        if dib_size == 12:
            # Legacy BITMAPCOREHEADER stores 16-bit unsigned dimensions.
            w, h = struct.unpack_from("<HH", header, 18)
        else:
            # BITMAPINFOHEADER and later: signed 32-bit; a negative height
            # marks a top-down bitmap.
            w, h = struct.unpack_from("<ii", header, 18)
        return abs(w), abs(h)

    return None


def _get_native_emu(image_path: str) -> int | None:
    """Return the image's native width in EMU, or ``None`` if unknown.

    The conversion assumes 72 DPI: 1 px = 914400 / 72 = 12700 EMU.
    """
    dims = _get_image_dimensions(image_path)
    if dims is None:
        return None
    w_px, _ = dims
    return w_px * _EMU_PER_PX


def _constrain_width(image_path: str, page_text_width: int) -> int | None:
    """Return the picture width in EMU, capped at the page text width.

    Parameters
    ----------
    image_path : str
        Path of the image file.
    page_text_width : int
        Width of the page's text area in EMU (page width minus margins).

    Returns
    -------
    int | None
        ``min(native width, page_text_width)``, or ``None`` when the native
        width cannot be determined or is not positive.
    """
    native = _get_native_emu(image_path)
    if native is None or native <= 0:
        return None
    return min(native, page_text_width)


def insert_image_paragraphs(
    doc: Document,
    paragraphs: list[dict],
    *,
    idx: int,
    parent,
):
    """Insert the image paragraphs into *doc* at position *idx* of *parent*.

    Every entry of *paragraphs* produces two Word paragraphs:

    1. a centered picture, shrunk to the page text width when it is wider
       and left at its own size otherwise;
    2. a centered caption taken from the entry's ``alt`` value.

    The entries end up in the document in their original order.

    Parameters
    ----------
    doc : Document
        Target document; its first section supplies the page geometry.
    paragraphs : list[dict]
        Image-paragraph dicts with a ``"src"`` key and an optional ``"alt"``.
    idx : int
        Child index in *parent* at which the new elements are inserted.
    parent
        Element receiving the paragraphs through ``parent.insert(idx, el)``
        (presumably the document body element -- confirm against the caller).

    Raises
    ------
    KeyError
        If an entry has no ``"src"`` key; raised before that entry changes
        the document.
    """
    section = doc.sections[0]
    # python-docx reports None for a page width or margin that is absent from
    # the section properties; width capping is skipped in that case.
    if section.page_width is None:
        page_text_width = None
    else:
        page_text_width = (
            section.page_width
            - (section.left_margin or 0)
            - (section.right_margin or 0)
        )

    # Everything is inserted at the same index, so walking the list backwards
    # (and adding each caption before its picture) yields forward order.
    for pd_data in reversed(paragraphs):
        src = pd_data["src"]

        # Caption: inserted first, so it ends up below the picture.
        # add_paragraph() appends to the end of the document; insert() then
        # moves that element to the requested position.
        cap_p = doc.add_paragraph(pd_data.get("alt", ""))
        cap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        parent.insert(idx, cap_p._element)

        # Picture paragraph with max-width behaviour.
        img_p = doc.add_paragraph()
        img_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        img_run = img_p.add_run()
        try:
            img_width = (
                _constrain_width(src, page_text_width)
                if page_text_width
                else None
            )
            img_run.add_picture(src, width=img_width)
        except Exception as exc:
            # Best effort: keep building the document and leave a visible
            # placeholder where the picture should have been.
            print(f"警告:图片加载失败 {src} — {exc}")
            img_run.add_text(f"[图片加载失败: {src}]")
        parent.insert(idx, img_p._element)