情感评论编码实战：基于ChatGPT的自动化情感分析解决方案

17次阅读

共计 2195 个字符，预计需要花费 6 分钟才能阅读完成。

人工进行情感评论编码存在三个主要瓶颈：

- 主观偏差：不同标注人员对同一评论可能给出不同情感标签
- 吞吐量低：人工处理速度难以应对每日数千条的用户评论
- 成本高：需要雇佣大量标注人员，人力成本居高不下

与传统 NLP 库相比，ChatGPT 在情感分析任务中的优势：

| 特性 | ChatGPT | TextBlob | VADER |
| --- | --- | --- | --- |
| 准确率 | 85-90% | 70-75% | 75-80% |
| 多语言支持 | 优秀 | 一般 | 仅英文 |
| 上下文理解 | 强 | 弱 | 弱 |

import os
from typing import Literal

import openai

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it in source; os.getenv yields None when the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

def analyze_sentiment(text: str) -> Literal["positive", "negative", "neutral"]:
    """Classify the sentiment of ``text`` with ChatGPT.

    The model is asked to answer with exactly one of ``positive``,
    ``negative`` or ``neutral``. The reply is normalised (surrounding
    whitespace, quotes and sentence punctuation removed, lower-cased) and
    validated, so the function always returns one of the three labels.

    Args:
        text: The comment to classify.

    Returns:
        ``"positive"``, ``"negative"`` or ``"neutral"``. ``"neutral"`` is also
        the best-effort fallback when the API call fails or the model replies
        with anything that is not one of the three labels.
    """
    prompt = f""" 请判断以下文本的情感倾向，仅输出 positive/negative/neutral:
    {text}
    """

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,  # low randomness keeps the labels stable
        )
        # The content may be missing/None; treat that as an empty reply.
        raw_reply = response.choices[0].message.content or ""
    except Exception as e:
        # Deliberate best-effort boundary: any failure degrades to neutral.
        print(f"API 调用失败: {e}")
        return "neutral"

    # Models frequently decorate the answer ("Positive.", '"negative"'), so
    # peel off wrapping punctuation before comparing against the label set.
    label = raw_reply.strip().strip("\"'`.!?。！？“”‘’").strip().lower()
    if label in ("positive", "negative", "neutral"):
        return label
    # Anything else would violate the declared return type; fall back to the
    # same default used for failures.
    return "neutral"

def chunk_text(text: str, max_tokens: int = 3500) -> list[str]:
    """Split long text into pieces small enough for one API request.

    Words (whitespace-separated runs) are packed greedily and re-joined with
    single spaces. The size limit is ``max_tokens * 0.75`` measured in
    *characters* of the joined chunk, a deliberately conservative stand-in
    for a real token count.

    Budget behind the default: gpt-3.5-turbo accepts at most 4096 tokens, and
    roughly 500 are reserved for the prompt and the response.

    A single word longer than the limit (for example Chinese text, which has
    no whitespace between words) is hard-split into limit-sized pieces, so no
    returned chunk exceeds the limit and no empty chunk is ever produced.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk; the effective character limit is
            ``int(max_tokens * 0.75)``, clamped to at least 1.

    Returns:
        The chunks in original order; an empty list for empty or
        whitespace-only input.
    """
    # Clamp to 1 so degenerate budgets cannot cause a zero-width slice step.
    limit = max(1, int(max_tokens * 0.75))

    chunks: list[str] = []
    current: list[str] = []
    current_len = 0  # always equals len(' '.join(current))

    for word in text.split():
        if len(word) > limit:
            # The word cannot fit in any chunk: flush what we have, then cut
            # the word itself into limit-sized pieces.
            if current:
                chunks.append(' '.join(current))
            pieces = [word[i:i + limit] for i in range(0, len(word), limit)]
            chunks.extend(pieces[:-1])
            # Keep the (possibly short) tail open so later words can join it.
            current = [pieces[-1]]
            current_len = len(pieces[-1])
            continue

        # A separating space is only needed when the chunk already has words.
        added = len(word) + (1 if current else 0)
        if current_len + added > limit:
            chunks.append(' '.join(current))
            current = [word]
            current_len = len(word)
        else:
            current.append(word)
            current_len += added

    if current:
        chunks.append(' '.join(current))

    return chunks

import aiohttp
import asyncio

async def async_sentiment_analysis(texts: list[str]) -> list[str]:
    """Analyse a batch of comments concurrently.

    One task per comment is scheduled on a shared ``aiohttp`` session, and
    the results are gathered in the same order as ``texts``.
    """
    async with aiohttp.ClientSession() as session:
        pending = [
            asyncio.create_task(analyze_sentiment_async(session, comment))
            for comment in texts
        ]
        return await asyncio.gather(*pending)

async def analyze_sentiment_async(session: aiohttp.ClientSession, text: str):
    """Wrap a single asynchronous sentiment call (placeholder).

    NOTE(review): no implementation is present in this block; as written the
    coroutine has no statements beyond this docstring and returns ``None``.
    """
    # Intended to implement logic similar to the synchronous version
    # ...

import redis
from hashlib import md5

# Module-level Redis client for the sentiment cache (localhost:6379, db 0).
# NOTE(review): decode_responses is not passed, so per the redis-py docs values
# read back with get() are presumably bytes rather than str — confirm.
r = redis.Redis(host='localhost', port=6379, db=0)

def get_cached_sentiment(text: str) -> str | None:
    """Look up a cached sentiment label by the MD5 hash of ``text``.

    Args:
        text: The comment whose cached label is wanted.

    Returns:
        The cached label as ``str``, or ``None`` when no entry exists.
    """
    text_hash = md5(text.encode()).hexdigest()
    cached = r.get(f"sentiment:{text_hash}")
    # The client is created without decode_responses, in which case redis
    # hands back bytes; decode so the value matches the declared return type
    # and compares equal to plain string labels.
    if isinstance(cached, bytes):
        return cached.decode("utf-8")
    return cached

def set_cached_sentiment(text: str, label: str, *, ttl_seconds: int = 86400) -> None:
    """Store a sentiment label in the cache under the MD5 hash of ``text``.

    Args:
        text: The comment that was analysed.
        label: The sentiment label to cache.
        ttl_seconds: Lifetime of the cache entry in seconds; defaults to
            86400 (one day).
    """
    text_hash = md5(text.encode()).hexdigest()
    r.setex(f"sentiment:{text_hash}", ttl_seconds, label)