Claude API实战：如何高效解析复杂Docx文档结构

1次阅读

共计 3028 个字符，预计需要花费 8 分钟才能阅读完成。

在开始解析之前，我们需要先了解.docx 文件的本质。实际上，一个.docx 文件是一个 ZIP 压缩包，里面包含多个 XML 文件和资源文件。当你解压一个.docx 文件时，会发现以下关键组成部分：

word/document.xml：存储文档的主要内容
word/styles.xml：包含文档的所有样式定义
word/numbering.xml：列表编号信息
word/footnotes.xml：脚注内容
word/headerX.xml 和 word/footerX.xml：页眉和页脚（X 为序号，例如 header1.xml、footer1.xml）
word/media/：存放所有嵌入的图片

解析 .docx 文档通常有两种方案：
方案一：直接解析 XML
优点：完全控制，可以访问所有细节
缺点：代码复杂，需要处理大量 XML 命名空间
方案二：使用 python-docx 库
优点：API 友好，快速上手
缺点：某些高级特性支持有限

# 安装必要库（tenacity 用于后文的错误重试示例）
# pip install python-docx anthropic tenacity

from typing import List, Dict, Optional
from docx import Document
import zipfile
import xml.etree.ElementTree as ET
from anthropic import Anthropic

def extract_formatted_text(doc_path: str) -> List[Dict]:
    """Collect every paragraph of a document together with its run formatting.

    Args:
        doc_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        One dict per paragraph, in document order, with the keys
        ``'paragraph_text'`` (the paragraph's text), ``'formats'`` (one dict
        per run holding ``'text'``, ``'bold'``, ``'italic'``, ``'underline'``
        and ``'font'``) and ``'style'`` (the paragraph style's name).
    """

    def describe_run(piece) -> Dict:
        # Snapshot of the run-level attributes, copied as-is from the run.
        return {
            'text': piece.text,
            'bold': piece.bold,
            'italic': piece.italic,
            'underline': piece.underline,
            'font': piece.font.name,
        }

    return [
        {
            'paragraph_text': para.text,
            'formats': [describe_run(piece) for piece in para.runs],
            'style': para.style.name,
        }
        for para in Document(doc_path).paragraphs
    ]

def extract_nested_tables_from_cell(cell) -> list:
    """Return the data of every table embedded in a single cell.

    Args:
        cell: A table cell exposing a ``tables`` sequence.

    Returns:
        One entry per table found in ``cell.tables``; each entry is that
        table's rows in the same shape ``extract_nested_tables`` produces.
        An empty list when the cell holds no tables.
    """
    return [_table_to_rows(inner) for inner in cell.tables]


def _table_to_rows(table) -> list:
    """Convert one table into a list of rows, recursing into nested tables."""
    rows = []
    for row in table.rows:
        row_data = []
        for cell in row.cells:
            if cell.tables:
                # A cell that embeds tables is represented by the nested
                # table data only; its own text is not stored separately.
                row_data.append(extract_nested_tables_from_cell(cell))
            else:
                row_data.append(cell.text)
        rows.append(row_data)
    return rows


def extract_nested_tables(doc_path: str) -> List[List[List[object]]]:
    """Extract the data of every top-level table, including nested tables.

    Args:
        doc_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        A list of tables; each table is a list of rows; each row is a list
        of cells. A cell is its text (``str``), or, when the cell embeds
        tables, a list with one entry per nested table in this same
        rows-of-cells shape.
    """
    doc = Document(doc_path)
    return [_table_to_rows(table) for table in doc.tables]

class DocxAnalyzer:
    """Summarise .docx documents by sending their text to Claude in chunks.

    Attributes are set in ``__init__``:
        client: The ``Anthropic`` client used for every request.
        model: Model name sent with each request.
        chunk_size: Maximum number of characters per chunk.
    """

    def __init__(self, api_key: str, model: str = "claude-2",
                 chunk_size: int = 10000):
        """Create the API client and store the request settings.

        Args:
            api_key: Key handed to the ``Anthropic`` client.
            model: Model name to request; defaults to the original
                hard-coded ``"claude-2"``.
            chunk_size: Maximum characters per chunk; must be positive.

        Raises:
            ValueError: If ``chunk_size`` is not a positive integer.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        self.client = Anthropic(api_key=api_key)
        self.model = model
        # Counted in characters (it is used as a string slice width), so it
        # is only a rough stand-in for the model's token-based context limit.
        self.chunk_size = chunk_size

    @staticmethod
    def _build_prompt(text: str) -> str:
        """Wrap the instruction and ``text`` in Human/Assistant turn markers."""
        # The Text Completions endpoint expects the prompt to open with a
        # "\n\nHuman:" turn and close with "\n\nAssistant:".
        return f"\n\nHuman: 请分析以下文档内容，提取关键信息并总结：{text}\n\nAssistant:"

    def _split_text(self, text: str) -> List[str]:
        """Cut ``text`` into consecutive slices of at most ``chunk_size`` chars."""
        size = self.chunk_size
        return [text[start:start + size] for start in range(0, len(text), size)]

    def analyze_with_claude(self, text: str) -> str:
        """Ask Claude to analyse and summarise ``text``.

        Args:
            text: The document text (or one chunk of it).

        Returns:
            The completion text returned by the API.
        """
        # NOTE(review): this uses the legacy Text Completions endpoint;
        # consider moving to the Messages API with a current model.
        response = self.client.completions.create(
            model=self.model,
            prompt=self._build_prompt(text),
            max_tokens_to_sample=1000
        )

        return response.completion

    def get_chunks(self, doc_path: str) -> List[str]:
        """Read a document and split its paragraph text into chunks.

        Args:
            doc_path: Path of the .docx file, passed straight to ``Document``.

        Returns:
            The paragraphs joined by newlines, cut into pieces of at most
            ``chunk_size`` characters; empty when the document has no text.
        """
        doc = Document(doc_path)
        full_text = "\n".join(p.text for p in doc.paragraphs)
        return self._split_text(full_text)

    def process_large_doc(self, doc_path: str) -> List[str]:
        """Analyse a large document chunk by chunk.

        Args:
            doc_path: Path of the .docx file.

        Returns:
            One analysis result per chunk, in document order.
        """
        return [self.analyze_with_claude(chunk)
                for chunk in self.get_chunks(doc_path)]

内存管理
使用生成器而非列表存储中间结果
对于超大文档，考虑流式处理

异步处理

import asyncio

async def async_analyze_chunk(chunk: str, analyzer: "DocxAnalyzer"):
    """Analyse one chunk without blocking the event loop.

    ``analyzer.analyze_with_claude`` is a regular blocking method, so it is
    run in the event loop's default executor and its result is awaited
    from there rather than being awaited directly.

    Args:
        chunk: The text to analyse.
        analyzer: Object providing ``analyze_with_claude(text)``.

    Returns:
        Whatever ``analyzer.analyze_with_claude(chunk)`` returns.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, analyzer.analyze_with_claude, chunk)

async def process_doc_async(doc_path: str):
    """Analyse every chunk of a document concurrently.

    Args:
        doc_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        The list produced by ``asyncio.gather``: one result per chunk, in
        chunk order.
    """
    # NOTE(review): API_KEY is not defined in the visible part of the file;
    # presumably a module-level constant - confirm it exists.
    analyzer = DocxAnalyzer(API_KEY)

    # Chunk the text here, using the same slicing as
    # DocxAnalyzer.process_large_doc, so this function relies only on the
    # analyzer's chunk_size attribute.
    doc = Document(doc_path)
    full_text = "\n".join(p.text for p in doc.paragraphs)
    size = analyzer.chunk_size
    chunks = [full_text[i:i + size] for i in range(0, len(full_text), size)]

    tasks = [async_analyze_chunk(chunk, analyzer) for chunk in chunks]
    return await asyncio.gather(*tasks)

错误重试机制

from tenacity import retry, stop_after_attempt, wait_exponential

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)
def safe_analyze(text: str) -> str:
    """Analyse ``text`` with the module-level ``analyzer``, retrying on failure.

    The decorator stops after three attempts and waits between them with an
    exponential back-off configured as ``multiplier=1, min=4, max=10``.
    Each failure is printed and then re-raised so the retry decorator sees it.

    Args:
        text: The text to analyse.

    Returns:
        The result of ``analyzer.analyze_with_claude(text)``.
    """
    # NOTE(review): `analyzer` is a global not defined in the visible part
    # of the file - confirm it is created before this function is called.
    try:
        outcome = analyzer.analyze_with_claude(text)
    except Exception as exc:
        print(f"分析失败: {exc}")
        raise
    else:
        return outcome