EntityConsistencyService.java

package com.yumu.noveltranslator.domain.service;

import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.yumu.noveltranslator.port.dto.translation.EntityExtractionResponse;
import com.yumu.noveltranslator.port.dto.translation.EntityTranslationResponse;
import com.yumu.noveltranslator.port.dto.translation.ConsistencyTranslationResult;
import com.yumu.noveltranslator.port.dto.translation.EntityMapping;
import com.yumu.noveltranslator.domain.model.Glossary;
import com.yumu.noveltranslator.domain.model.UserPreference;
import com.yumu.noveltranslator.port.out.EntityCachePort;
import com.yumu.noveltranslator.port.out.GlossaryRepositoryPort;
import com.yumu.noveltranslator.port.out.UserRepositoryPort;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.time.Duration;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 实体一致性翻译服务
 * 核心流程:提取实体 → 翻译实体 → 构建占位符映射 → 替换原文 → 翻译 → 还原占位符
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class EntityConsistencyService {

    /** 文本超过此字符数才启用实体一致性 */
    private static final int MIN_TEXT_LENGTH = 500;

    /** 文本超过此字符数时分段提取实体 */
    private static final int SEGMENT_EXTRACTION_THRESHOLD = 5000;
    /** 分段提取的目标段大小(字符数) */
    private static final int ENTITY_SEGMENT_SIZE = 3000;

    /** 禁用代理的 ProxySelector,确保内部 Docker 服务直连 */
    private static final java.net.ProxySelector NO_PROXY_SELECTOR = new java.net.ProxySelector() {
        @Override
        public java.util.List<java.net.Proxy> select(java.net.URI uri) {
            return java.util.List.of(java.net.Proxy.NO_PROXY);
        }
        @Override
        public void connectFailed(java.net.URI uri, java.net.SocketAddress sa, java.io.IOException ioe) {}
    };

    private static final HttpClient HTTP_CLIENT = HttpClient.newBuilder()
            .connectTimeout(Duration.ofSeconds(5))
            .proxy(NO_PROXY_SELECTOR)
            .build();

    @Value("${translation.python.url:http://llm-engine:8000/translate}")
    private String pythonTranslateUrl;

    @Value("${translation.python.api-key:}")
    private String pythonApiKey;

    private final EntityCachePort entityCachePort;
    private final GlossaryRepositoryPort glossaryPort;
    private final UserRepositoryPort userPort;

    /**
     * 判断是否需要启用实体一致性
     */
    public boolean shouldUseConsistency(String text) {
        return text != null && text.length() >= MIN_TEXT_LENGTH;
    }

    /**
     * 带实体一致性的翻译入口
     */
    public ConsistencyTranslationResult translateWithConsistency(
            String sourceText, String targetLang, String engine,
            Long userId, String documentId) {

        ConsistencyTranslationResult result = new ConsistencyTranslationResult();
        result.setConsistencyApplied(false);

        try {
            // 1. 先从文档缓存获取已有的实体映射
            Map<String, String> cachedEntities = entityCachePort.getEntityMap(userId, documentId);

            // 2. 分段提取实体(长文本自动分段)
            List<String> extractedEntities = extractEntitiesSegmented(sourceText, targetLang);

            // 3. 合并术语库:如果用户启用了术语库,提取术语库中的译法
            Map<String, String> glossaryTerms = loadGlossaryTerms(userId, sourceText);
            if (!glossaryTerms.isEmpty()) {
                log.info("术语库匹配到 {} 个实体译法: {}", glossaryTerms.size(), glossaryTerms.keySet());
            }

            if (extractedEntities.isEmpty() && glossaryTerms.isEmpty()) {
                log.info("未提取到实体且术语库无匹配,使用常规翻译");
                return result;
            }
            log.info("提取到 {} 个实体: {}", extractedEntities.size(), extractedEntities);

            // 4. 过滤掉缓存中已有的实体,只翻译新增的
            // 术语库中已有的实体不需要 LLM 翻译
            Set<String> glossaryKeys = glossaryTerms.keySet();
            List<String> newEntities = extractedEntities.stream()
                    .filter(e -> !cachedEntities.containsKey(e) && !glossaryKeys.contains(e))
                    .toList();

            // 构建完整映射:缓存 + 术语库(优先级最高)
            Map<String, String> allTranslations = new LinkedHashMap<>(cachedEntities);
            allTranslations.putAll(glossaryTerms);  // 术语库覆盖缓存中的同名实体

            if (!newEntities.isEmpty()) {
                // 5. 翻译新增实体
                Map<String, String> newTranslations = translateEntities(newEntities, targetLang);
                allTranslations.putAll(newTranslations);
                log.info("翻译了 {} 个新增实体", newTranslations.size());
            }

            // 6. 去重 + 处理嵌套实体(保留最长匹配)
            Map<String, String> dedupedMap = deduplicateEntities(allTranslations, sourceText);

            // 7. 构建占位符映射
            EntityMappingContext context = buildMapping(dedupedMap);

            // 8. 原文中替换实体为占位符
            String textWithPlaceholders = replaceEntitiesWithPlaceholders(sourceText, context);

            // 9. 调用翻译(带占位符保护)
            String translatedWithPlaceholders = translateWithPlaceholders(
                    textWithPlaceholders, targetLang, engine);

            if (translatedWithPlaceholders == null || translatedWithPlaceholders.isBlank()) {
                log.error("带占位符翻译返回为空");
                return result;
            }

            // 10. 译文中将占位符替换为翻译后的实体名
            String finalTranslated = restorePlaceholders(translatedWithPlaceholders, context);

            // 11. 合并到文档缓存(不包含术语库,术语库始终来自 DB)
            Map<String, String> nonGlossaryMap = new LinkedHashMap<>(dedupedMap);
            glossaryKeys.forEach(nonGlossaryMap::remove);
            entityCachePort.mergeEntityMap(userId, documentId, nonGlossaryMap);

            // 构建返回结果
            result.setTranslatedText(finalTranslated);
            result.setMappings(context.mappings);
            result.setOriginalWithPlaceholders(textWithPlaceholders);
            result.setConsistencyApplied(true);

            log.info("实体一致性翻译完成: 实体数={}, 原文长度={}, 译文长度={}",
                    context.mappings.size(), sourceText.length(), finalTranslated.length());

            return result;

        } catch (Exception e) {
            log.error("实体一致性翻译失败: {},降级为常规翻译", e.getMessage(), e);
            return result;
        }
    }

    // ==================== 文本分段与分段实体提取 ====================

    /**
     * 将长文本按段落边界切分为多个片段,用于分段实体提取
     *
     * 规则:
     * - 文本 <= SEGMENT_EXTRACTION_THRESHOLD (5000) 字:不分段,返回单片段
     * - 文本 > 5000 字:按 ENTITY_SEGMENT_SIZE (3000) 字分段
     * - 切分点在段落边界(\n\n)或句子边界(。!?\n)
     * - 不破坏原有文字完整性
     *
     * @param text 原文
     * @return 分段后的文本列表
     */
    public List<String> splitTextForEntityExtraction(String text) {
        if (text == null || text.length() <= SEGMENT_EXTRACTION_THRESHOLD) {
            return List.of(text != null ? text : "");
        }

        List<String> segments = new ArrayList<>();
        String[] paragraphs = text.split("(?<=\n\n)");  // 保留分隔符
        StringBuilder current = new StringBuilder();

        for (String para : paragraphs) {
            if (current.length() + para.length() > ENTITY_SEGMENT_SIZE && current.length() > 0) {
                // 当前段已满,检查是否能在句子边界切分
                String segment = current.toString();
                if (segment.length() > ENTITY_SEGMENT_SIZE * 1.5) {
                    // 如果当前段远超目标大小,尝试在句子边界回退切分
                    segments.addAll(splitAtSentenceBoundary(segment));
                } else {
                    segments.add(segment);
                }
                current = new StringBuilder();
            }
            current.append(para);
        }

        // 处理剩余内容
        if (current.length() > 0) {
            String remaining = current.toString();
            if (remaining.length() > ENTITY_SEGMENT_SIZE * 1.5) {
                segments.addAll(splitAtSentenceBoundary(remaining));
            } else {
                segments.add(remaining);
            }
        }

        // 如果分段后只剩一个空段,返回原文
        if (segments.isEmpty()) {
            return List.of(text);
        }

        log.info("文本分段: 原文{}字, 分为{}段", text.length(), segments.size());
        return segments;
    }

    /**
     * 在句子边界切分超长片段
     * 句子边界:。!?\n
     */
    private List<String> splitAtSentenceBoundary(String text) {
        List<String> parts = new ArrayList<>();
        // 按句子边界切分
        String[] sentences = text.split("(?<=[。!?\n])");
        StringBuilder current = new StringBuilder();

        for (String sentence : sentences) {
            if (current.length() + sentence.length() > ENTITY_SEGMENT_SIZE && current.length() > 0) {
                parts.add(current.toString());
                current = new StringBuilder();
            }
            current.append(sentence);
        }

        if (current.length() > 0) {
            parts.add(current.toString());
        }

        return parts.isEmpty() ? List.of(text) : parts;
    }

    /**
     * 分段提取实体(适用于长文本)
     *
     * 逻辑:
     * 1. 先调用 splitTextForEntityExtraction 分段
     * 2. 对每个分段调用 Python /extract-entities
     * 3. 合并所有分段的实体结果(去重)
     *
     * @param text       原文
     * @param targetLang 目标语言
     * @return 去重后的实体列表
     */
    public List<String> extractEntitiesSegmented(String text, String targetLang) {
        List<String> segments = splitTextForEntityExtraction(text);

        if (segments.size() == 1) {
            // 不分段,直接提取
            try {
                return extractEntities(segments.get(0), targetLang);
            } catch (Exception e) {
                log.warn("实体提取失败: {}", e.getMessage());
                return Collections.emptyList();
            }
        }

        Set<String> allEntities = new LinkedHashSet<>();
        for (int i = 0; i < segments.size(); i++) {
            try {
                List<String> segmentEntities = extractEntities(segments.get(i), targetLang);
                allEntities.addAll(segmentEntities);
                log.debug("实体提取: 第{}/{}段, 提取{}个实体", i + 1, segments.size(), segmentEntities.size());
            } catch (Exception e) {
                log.warn("实体提取: 第{}/{}段失败: {}", i + 1, segments.size(), e.getMessage());
            }
        }

        List<String> result = new ArrayList<>(allEntities);
        log.info("分段实体提取完成: 原文{}字, {}段, 共{}个实体", text.length(), segments.size(), result.size());
        return result;
    }

    // ==================== 原有方法 ====================

    /**
     * 调用 Python /extract-entities 提取实体(公开方法,供外部调用)
     */
    public List<String> extractEntities(String text, String targetLang) throws Exception {
        String baseUrl = pythonTranslateUrl.replace("/translate", "");
        String url = baseUrl + "/extract-entities";

        Map<String, Object> body = new LinkedHashMap<>();
        body.put("text", text);
        body.put("source_lang", "auto");
        body.put("target_lang", targetLang);

        String jsonBody = JSON.toJSONString(body);
        var reqBuilder = HttpRequest.newBuilder()
                .uri(URI.create(url))
                .header("Content-Type", "application/json; charset=UTF-8")
                .timeout(Duration.ofSeconds(30))
                .POST(HttpRequest.BodyPublishers.ofString(jsonBody, StandardCharsets.UTF_8));
        if (pythonApiKey != null && !pythonApiKey.isEmpty()) {
            reqBuilder.header("X-Service-Key", pythonApiKey);
        }
        HttpRequest request = reqBuilder.build();

        HttpResponse<String> response;
        try {
            response = sendWithRetry(request, 2);
        } catch (Exception e) {
            log.warn("实体提取失败,跳过一致性处理: {}", e.getMessage());
            return Collections.emptyList();
        }
        String responseBody = response.body();
        if (responseBody == null || responseBody.isBlank()) {
            log.warn("Python服务返回空响应体, status={}, 跳过实体提取", response.statusCode());
            return Collections.emptyList();
        }

        try {
            EntityExtractionResponse result = JSON.parseObject(responseBody, EntityExtractionResponse.class);
            return result.getEntities() != null ? result.getEntities() : Collections.emptyList();
        } catch (Exception e) {
            log.error("实体提取 JSON 解析失败: 响应前200字符={}", responseBody.length() > 200 ? responseBody.substring(0, 200) : responseBody);
            throw e;
        }
    }

    /**
     * 调用 Python /translate-entities 批量翻译实体(公开方法,供外部调用)
     */
    public Map<String, String> translateEntities(List<String> entities, String targetLang) throws Exception {
        String baseUrl = pythonTranslateUrl.replace("/translate", "");
        String url = baseUrl + "/translate-entities";

        Map<String, Object> body = new LinkedHashMap<>();
        body.put("entities", entities);
        body.put("source_lang", "auto");
        body.put("target_lang", targetLang);

        String jsonBody = JSON.toJSONString(body);
        var reqBuilder = HttpRequest.newBuilder()
                .uri(URI.create(url))
                .header("Content-Type", "application/json; charset=UTF-8")
                .timeout(Duration.ofSeconds(30))
                .POST(HttpRequest.BodyPublishers.ofString(jsonBody, StandardCharsets.UTF_8));
        if (pythonApiKey != null && !pythonApiKey.isEmpty()) {
            reqBuilder.header("X-Service-Key", pythonApiKey);
        }
        HttpRequest request = reqBuilder.build();

        HttpResponse<String> response = sendWithRetry(request, 2);
        String responseBody = response.body();
        if (responseBody == null || responseBody.isBlank()) {
            throw new RuntimeException("Python服务返回空响应体, status=" + response.statusCode());
        }

        try {
            EntityTranslationResponse result = JSON.parseObject(responseBody, EntityTranslationResponse.class);
            return result.getTranslations() != null ? result.getTranslations() : Collections.emptyMap();
        } catch (Exception e) {
            log.error("实体翻译 JSON 解析失败: 响应前200字符={}", responseBody.length() > 200 ? responseBody.substring(0, 200) : responseBody);
            throw e;
        }
    }

    /**
     * 调用 Python /translate-with-placeholders 翻译带占位符的文本
     */
    private String translateWithPlaceholders(String text, String targetLang, String engine) throws Exception {
        String baseUrl = pythonTranslateUrl.replace("/translate", "");
        String url = baseUrl + "/translate-with-placeholders";
        log.info("[一致性翻译] URL={}, text length={}", url, text.length());

        Map<String, Object> body = new LinkedHashMap<>();
        body.put("text", text);
        body.put("target_lang", targetLang);
        body.put("engine", engine != null ? engine : "openai");
        body.put("fallback", true);

        String jsonBody = JSON.toJSONString(body);
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(url))
                .header("Content-Type", "application/json; charset=UTF-8")
                .timeout(Duration.ofSeconds(60))
                .POST(HttpRequest.BodyPublishers.ofString(jsonBody, StandardCharsets.UTF_8))
                .build();

        log.info("[一致性翻译] 发送请求到 {}", request.uri());
        HttpResponse<String> response = sendWithRetry(request, 2);
        String responseBody = response.body();
        log.info("[一致性翻译] 响应: status={}, body length={}", response.statusCode(), responseBody != null ? responseBody.length() : 0);

        if (responseBody == null || responseBody.isBlank()) {
            throw new RuntimeException("Python服务返回空响应体, status=" + response.statusCode());
        }

        try {
            JSONObject json = JSON.parseObject(responseBody);
            return json.getString("data");
        } catch (Exception e) {
            log.error("占位符翻译 JSON 解析失败: 响应前200字符={}", responseBody.length() > 200 ? responseBody.substring(0, 200) : responseBody);
            throw e;
        }
    }

    /**
     * 去重 + 处理嵌套实体(保留最长匹配)
     */
    private Map<String, String> deduplicateEntities(Map<String, String> translations, String sourceText) {
        // 按实体长度降序排序
        List<Map.Entry<String, String>> sorted = new ArrayList<>(translations.entrySet());
        sorted.sort((a, b) -> Integer.compare(b.getKey().length(), a.getKey().length()));

        Map<String, String> deduped = new LinkedHashMap<>();
        for (Map.Entry<String, String> entry : sorted) {
            String entity = entry.getKey();
            // 检查该实体是否被已添加的更长实体包含
            boolean isSubEntity = false;
            for (String existing : deduped.keySet()) {
                if (existing.contains(entity) && existing.length() > entity.length()) {
                    isSubEntity = true;
                    break;
                }
            }
            if (!isSubEntity && sourceText.contains(entity)) {
                deduped.put(entity, entry.getValue());
            }
        }

        return deduped;
    }

    /**
     * 构建占位符映射上下文(公开方法,供外部调用)
     * 使用 __ENT_<hash>__ 格式占位符,避免 [{N}] 与原文内容碰撞
     */
    public EntityMappingContext buildMapping(Map<String, String> entityTranslations) {
        List<EntityMapping> mappings = new ArrayList<>();
        Map<String, String> entityToPlaceholder = new LinkedHashMap<>();
        int index = 1;

        for (Map.Entry<String, String> entry : entityTranslations.entrySet()) {
            String placeholder = generatePlaceholder(entry.getKey(), entry.getValue(), index);
            EntityMapping mapping = EntityMapping.builder()
                    .sourceText(entry.getKey())
                    .translatedText(entry.getValue())
                    .placeholder(placeholder)
                    .index(index)
                    .build();
            mappings.add(mapping);
            entityToPlaceholder.put(entry.getKey(), placeholder);
            index++;
        }

        return new EntityMappingContext(mappings, entityToPlaceholder);
    }

    /**
     * 生成确定性占位符:__ENT_<8-char-hash>__
     * 相同原文+译文始终生成相同占位符,便于调试和日志追踪。
     */
    private String generatePlaceholder(String source, String translated, int fallbackIndex) {
        try {
            MessageDigest md = MessageDigest.getInstance("SHA-256");
            byte[] hash = md.digest((source + ":" + translated).getBytes(StandardCharsets.UTF_8));
            StringBuilder hex = new StringBuilder(8);
            for (int i = 0; i < 4; i++) {
                hex.append(String.format("%02x", hash[i]));
            }
            return "__ENT_" + hex + "__";
        } catch (Exception e) {
            // SHA-256 不可用时回退到序号格式(极不可能)
            return "__ENT_IDX_" + fallbackIndex + "__";
        }
    }

    /**
     * 原文中替换实体为占位符(公开方法,供外部调用)
     * 注意:按实体长度降序替换,避免短实体先替换导致长实体无法匹配
     */
    public String replaceEntitiesWithPlaceholders(String text, EntityMappingContext context) {
        String result = text;
        // 按实体长度降序
        List<String> sortedEntities = context.entityToPlaceholder.keySet().stream()
                .sorted((a, b) -> Integer.compare(b.length(), a.length()))
                .toList();

        for (String entity : sortedEntities) {
            String placeholder = context.entityToPlaceholder.get(entity);
            // 使用 String.replace 替换所有出现
            result = result.replace(entity, placeholder);
        }

        return result;
    }

    /**
     * 加载用户术语库中在原文中出现的实体译法
     * 术语库优先级高于 LLM 自动翻译
     */
    private Map<String, String> loadGlossaryTerms(Long userId, String sourceText) {
        Map<String, String> terms = new LinkedHashMap<>();

        // 检查用户是否启用了术语库
        try {
            UserPreference pref = userPort.findPreferenceByUserId(userId).orElse(null);
            if (pref == null || !Boolean.TRUE.equals(pref.getEnableGlossary())) {
                log.debug("用户未启用术语库或偏好不存在");
                return terms;
            }
        } catch (Exception e) {
            log.warn("查询用户偏好失败,术语库注入跳过: {}", e.getMessage());
            return terms;
        }

        // 查询用户的所有术语条目
        try {
            List<Glossary> allTerms = glossaryPort.findActiveGlossaryByUserId(userId);

            // 只在原文中实际出现的术语
            for (Glossary term : allTerms) {
                if (term.getSourceWord() != null && sourceText.contains(term.getSourceWord())) {
                    terms.put(term.getSourceWord(), term.getTargetWord());
                }
            }
        } catch (Exception e) {
            log.warn("查询术语库失败: {}", e.getMessage());
        }

        return terms;
    }

    /**
     * 译文中将占位符替换为翻译后的实体名(公开方法,供外部调用)
     * 占位符格式为 __ENT_<hash>__,LLM 几乎不可能破坏此格式
     */
    public String restorePlaceholders(String text, EntityMappingContext context) {
        String result = text;
        for (EntityMapping mapping : context.mappings) {
            String placeholder = mapping.getPlaceholder();
            if (result.contains(placeholder)) {
                result = result.replace(placeholder, mapping.getTranslatedText());
            } else {
                log.warn("占位符还原失败: 未找到 {} (原文={}) 在译文中", placeholder, mapping.getSourceText());
            }
        }
        return result;
    }

    /**
     * 带重试的 HTTP 请求发送
     */
    private HttpResponse<String> sendWithRetry(HttpRequest request, int maxRetries) throws Exception {
        HttpResponse<String> response = null;
        for (int i = 0; i <= maxRetries; i++) {
            try {
                response = HTTP_CLIENT.send(request, HttpResponse.BodyHandlers.ofString());
                if (response.statusCode() == 200) {
                    return response;
                }
                log.warn("HTTP 请求失败 (尝试 {}/{}): status={}", i + 1, maxRetries + 1, response.statusCode());
            } catch (Exception e) {
                log.warn("HTTP 请求异常 (尝试 {}/{}): {}", i + 1, maxRetries + 1, e.getMessage());
            }
            if (i < maxRetries) {
                Thread.sleep(1000L * (i + 1));
            }
        }
        if (response != null) {
            return response;
        }
        throw new RuntimeException("HTTP 请求重试 " + maxRetries + " 次后仍失败");
    }

    /**
     * 占位符映射上下文(公开静态类,供外部使用)
     */
    public record EntityMappingContext(
            List<EntityMapping> mappings,
            Map<String, String> entityToPlaceholder
    ) {}
}