Files
md2word/transit/images.py
zzy 4e39a4f2ac feat(transit): 添加正文引用标记到书签超链接功能
- 新增 CITE_PATTERN 正则表达式匹配 [N] 引用格式
- 添加 base_dir 参数支持相对图片路径解析
- 实现书签创建和超链接替换功能
- 添加 link_body_citations 函数处理正文引用链接
- 在参考文献段落中添加书签标识
- 支持将 [N] 引用替换为指向参考文献的超链接
2026-05-10 15:07:20 +08:00

164 lines
5.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
图片处理模块。
将 Markdown 中的 ``<img>`` 标签解析为独立段落,
并在 Word 文档中插入图片及居中图标题。
"""
import struct
import re
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Matches an <img ...> tag carrying a quoted src attribute (case-insensitive).
# Group 1 is the attribute text before src, group 2 is the src value, and
# group 3 is the attribute text after src up to the closing ">".
_IMG_TAG = re.compile(
r'<img\s+([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>', re.IGNORECASE
)
# Extracts name/value pairs from a tag's attribute text; the value must be
# wrapped in single or double quotes and may be empty.
_ATTR = re.compile(r'(\w+)\s*=\s*["\']([^"\']*)["\']')
# Matches a whole string of the form **图N ...** (also "Table", "Figure",
# "Fig" / "Fig." followed by a number, case-insensitive). Per the original
# note these are duplicate figure captions that are skipped during injection.
_FIG_CAPTION = re.compile(
r'^\*\*(?:图|Table|Figure|Fig\.?)\s*\d+.*\*\*$', re.IGNORECASE
)
def make_image_paragraph(block: str, base_dir: str | Path | None = None) -> dict | None:
    """Build an image-paragraph dict from the first ``<img>`` tag in *block*.

    Attributes are read starting at the matched tag, the first occurrence of
    each name wins, and names are compared case-insensitively. Attributes of
    tags that precede the image, or of a later ``<img>``, therefore cannot
    override the matched image's ``src`` / ``alt``.

    Parameters
    ----------
    block : str
        Text block to inspect.
    base_dir : str | Path | None
        Directory of the Markdown file. When given, a non-absolute ``src`` is
        resolved against it.

    Returns
    -------
    dict | None
        ``{"type": "image", "src": ..., "alt": ...}``, or ``None`` when
        *block* contains no ``<img>`` tag with a quoted ``src``.
    """
    m = _IMG_TAG.search(block)
    if not m:
        return None

    attrs: dict[str, str] = {}
    # Scan from the start of the matched tag rather than the whole block so
    # that earlier tags do not contribute attributes.
    for attr in _ATTR.finditer(block, m.start()):
        # ``\w+`` also matches the tail of hyphenated / namespaced names
        # (``data-src``, ``xlink:href``); those are different attributes.
        if block[attr.start() - 1] in "-:":
            continue
        # _IMG_TAG is case-insensitive, so normalise names (``ALT`` -> ``alt``);
        # setdefault keeps the first occurrence of each name.
        attrs.setdefault(attr.group(1).lower(), attr.group(2))

    src = attrs.get("src", m.group(2))
    if base_dir and not Path(src).is_absolute():
        src = str(Path(base_dir) / src)

    return {
        "type": "image",
        "src": src,
        "alt": attrs.get("alt", ""),
    }
def is_figure_caption(block: str) -> bool:
    """Tell whether *block* is a bold figure caption such as ``**图X ...**``.

    Surrounding whitespace is stripped before the text is matched against the
    module-level ``_FIG_CAPTION`` pattern.
    """
    candidate = block.strip()
    return _FIG_CAPTION.match(candidate) is not None
def _get_image_dimensions(image_path: str) -> tuple[int, int] | None:
"""读取图片文件头返回 ``(width_px, height_px)``,不支持格式返回 ``None``。
仅读取文件头,不依赖第三方库。支持 PNG / JPEG / GIF / BMP。
"""
try:
with open(image_path, "rb") as f:
header = f.read(32)
except Exception:
return None
# PNG: 8-byte signature, then IHDR chunk
if header[:8] == b"\x89PNG\r\n\x1a\n":
w, h = struct.unpack_from(">II", header, 16)
return w, h
# JPEG: starts with FF D8, scan for SOF marker
if header[:2] == b"\xff\xd8":
pos = 2
while pos < len(header):
if header[pos] != 0xFF:
return None
marker = header[pos + 1]
if marker in (0xC0, 0xC1, 0xC2):
h, w = struct.unpack_from(">HH", header, pos + 5)
return w, h
seg_len = struct.unpack_from(">H", header, pos + 2)[0]
pos += 2 + seg_len
return None
# GIF: "GIF87a" or "GIF89a"
if header[:6] in (b"GIF87a", b"GIF89a"):
w, h = struct.unpack_from("<HH", header, 6)
return w, h
# BMP: "BM" signature
if header[:2] == b"BM":
w, h = struct.unpack_from("<ii", header, 18)
return w, abs(h)
return None
def _get_native_emu(image_path: str) -> int | None:
    """Return the image's pixel width converted to EMU, or ``None`` on failure.

    The conversion uses 12700 EMU per pixel (914400 EMU per inch / 72), i.e.
    the image is treated as 72 DPI. ``None`` is returned when
    ``_get_image_dimensions`` cannot determine the size.
    """
    size = _get_image_dimensions(image_path)
    if size is not None:
        width_px = size[0]
        return width_px * 12700
    return None
def _constrain_width(image_path: str, page_text_width: int) -> int | None:
    """Return the image width in EMU, capped at the page text width.

    Parameters
    ----------
    image_path : str
        Path of the image file.
    page_text_width : int
        Width of the page text area in EMU (page width minus the margins).

    Returns
    -------
    int | None
        The smaller of the image's native width and *page_text_width*, or
        ``None`` when the native width cannot be determined.
    """
    native_width = _get_native_emu(image_path)
    return None if native_width is None else min(native_width, page_text_width)
def insert_image_paragraphs(
    doc: Document,
    paragraphs: list[dict],
    *,
    idx: int,
    parent,
):
    """Insert a sequence of image paragraphs into *doc* at a given position.

    Each entry produces two Word paragraphs:

    1. a centered picture, shrunk to the page text width when it is wider
       than that and left at its own size otherwise;
    2. a centered caption taken from the entry's ``alt`` value.

    The entries end up in the same order as in *paragraphs*.

    Parameters
    ----------
    doc : Document
        Target document; its first section supplies the page width and margins.
    paragraphs : list[dict]
        Image paragraph dicts; the ``"src"`` and ``"alt"`` keys are read.
    idx : int
        Child index under *parent* at which the new elements are inserted.
    parent
        Element whose ``insert(index, element)`` method receives the new
        paragraph elements.
    """
    section = doc.sections[0]
    page_text_width = section.page_width - section.left_margin - section.right_margin

    # Every element is inserted at the same index, so walk the entries
    # backwards: later insertions push earlier ones down and the final order
    # matches ``paragraphs``.
    for pd_data in reversed(paragraphs):
        # Read the path once so the error handler below cannot itself fail on
        # a missing key.
        src = pd_data.get("src", "")

        # Caption goes in first and therefore ends up below the picture.
        # add_paragraph() creates the paragraph via the document; insert()
        # then places its element at the requested position (lxml relocates
        # an element that already has a parent).
        cap_p = doc.add_paragraph(pd_data.get("alt", ""))
        cap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        parent.insert(idx, cap_p._element)

        img_p = doc.add_paragraph()
        img_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        img_run = img_p.add_run()

        # Probing the width is only an optimisation. If it fails, fall back to
        # ``None`` (no explicit width) instead of reporting the picture itself
        # as unloadable.
        try:
            img_width = _constrain_width(src, page_text_width)
        except Exception:
            img_width = None

        # Best effort: a picture that cannot be loaded is replaced by a
        # visible placeholder so the rest of the document is still produced.
        try:
            img_run.add_picture(src, width=img_width)
        except Exception as exc:
            print(f"警告:图片加载失败 {src}: {exc}")
            img_run.add_text(f"[图片加载失败: {src}]")
        parent.insert(idx, img_p._element)