// TranslationPostProcessingService.java
package com.yumu.noveltranslator.domain.service;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import jakarta.annotation.PreDestroy;
/**
 * Translation post-processing: detects runs of Chinese characters that were left
 * untranslated in a translated text and asks the translation service to translate them.
 */
@Service
@Slf4j
public class TranslationPostProcessingService {

    /** Matches a run of two or more consecutive Chinese characters. */
    private static final Pattern CHINESE_PATTERN = Pattern.compile("[\u4e00-\u9fff\u3400-\u4dbf]{2,}");

    /**
     * Absolute cap, in characters, on leftover Chinese. Above this value remediation is
     * skipped so that the cost of the extra translation call stays bounded.
     */
    private static final int MAX_REMEDIAL_CHARS = 500;

    /**
     * Relative cap, as a fraction of the translated text length. Above this value the
     * translation is treated as a likely failure and remediation is skipped.
     */
    private static final double MAX_REMEDIAL_RATIO = 0.15;

    /** Path of the translate endpoint on the Python service. */
    private static final String TRANSLATE_PATH = "/translate";

    private final HttpClient httpClient = HttpClient.newBuilder()
            .connectTimeout(Duration.ofSeconds(5))
            .build();

    @Value("${translation.python.url:http://llm-engine:8000/translate}")
    private String pythonTranslateUrl;

    @Value("${translation.python.api-key:}")
    private String pythonApiKey;

    /**
     * Detects leftover Chinese in a translated text and replaces it with a remedial translation.
     *
     * <p>The text is returned unchanged when the target language is Japanese, Korean or Chinese,
     * when no leftover Chinese is found, when the amount of leftover Chinese exceeds either
     * cost limit, or when the remedial translation fails for any reason.
     *
     * @param sourceText     the original source text (currently unused)
     * @param translatedText the translation to inspect; may be {@code null} or empty
     * @param targetLang     the target language code or name
     * @param engine         the translation engine to use; {@code "openai"} is sent when {@code null}
     * @return the repaired translation, or {@code translatedText} as given if nothing was changed
     */
    public String fixUntranslatedChinese(String sourceText, String translatedText, String targetLang, String engine) {
        // Output in these languages can legitimately contain characters from the same Unicode
        // ranges as Chinese, so the detection would mostly produce false positives.
        if (isJapaneseTarget(targetLang) || isKoreanTarget(targetLang) || isChineseTarget(targetLang)) {
            log.debug("[后处理] 目标语言为 {},跳过残留中文检测(CJK 文字共享 Unicode 范围)", targetLang);
            return translatedText;
        }
        if (translatedText == null || translatedText.isEmpty()) {
            return translatedText;
        }

        List<String> segments = detectChineseSegments(translatedText);
        if (segments.isEmpty()) {
            return translatedText;
        }

        int totalChineseChars = segments.stream().mapToInt(String::length).sum();

        // Cost circuit breaker: too much leftover Chinese means the remedial call is not worth it.
        if (totalChineseChars > MAX_REMEDIAL_CHARS) {
            log.warn("[后处理] 残留中文 {} 字超过绝对上限({}),放弃后处理以避免成本失控", totalChineseChars, MAX_REMEDIAL_CHARS);
            return translatedText;
        }
        double ratio = (double) totalChineseChars / translatedText.length();
        if (ratio > MAX_REMEDIAL_RATIO) {
            // SLF4J only understands "{}", so the percentages are rounded before logging.
            log.warn("[后处理] 残留中文占比 {}% 超过阈值({}%),译文可能翻译失败,放弃后处理",
                    Math.round(ratio * 1000) / 10.0, Math.round(MAX_REMEDIAL_RATIO * 100));
            return translatedText;
        }

        log.info("[后处理] 检测到 {} 段残留中文 (共 {} 字): {}", segments.size(), totalChineseChars, segments);
        try {
            String remedied = remediateSegments(segments, targetLang, engine);
            String repaired = applyRemediation(translatedText, segments, remedied);
            log.info("[后处理] 补救完成,原文长度={}, 修复后长度={}", translatedText.length(), repaired.length());
            return repaired;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can react to the cancellation.
            Thread.currentThread().interrupt();
            log.warn("[后处理] 补救被中断,保留原始译文");
            return translatedText;
        } catch (IOException | RuntimeException e) {
            // Remediation is best effort: keep the original translation on any failure.
            log.warn("[后处理] 补救失败: {},保留原始译文", e.getMessage(), e);
            return translatedText;
        }
    }

    /**
     * Collects the distinct runs of consecutive Chinese characters found in the text,
     * in order of first appearance.
     */
    private List<String> detectChineseSegments(String text) {
        List<String> segments = new ArrayList<>();
        Matcher matcher = CHINESE_PATTERN.matcher(text);
        while (matcher.find()) {
            segments.add(matcher.group());
        }
        return segments.stream().distinct().toList();
    }

    /** Returns whether the target language denotes Japanese ("ja" or "japanese", case-insensitive). */
    private static boolean isJapaneseTarget(String targetLang) {
        return targetLang != null && (targetLang.equalsIgnoreCase("ja") || targetLang.equalsIgnoreCase("japanese"));
    }

    /** Returns whether the target language denotes Korean ("ko" or "korean", case-insensitive). */
    private static boolean isKoreanTarget(String targetLang) {
        return targetLang != null && (targetLang.equalsIgnoreCase("ko") || targetLang.equalsIgnoreCase("korean"));
    }

    /** Returns whether the target language denotes Chinese ("zh", "zh-CN", "zh-TW" or "chinese", case-insensitive). */
    private static boolean isChineseTarget(String targetLang) {
        return targetLang != null && (
                targetLang.equalsIgnoreCase("zh") ||
                targetLang.equalsIgnoreCase("zh-CN") ||
                targetLang.equalsIgnoreCase("zh-TW") ||
                targetLang.equalsIgnoreCase("chinese"));
    }

    /**
     * Builds the URL of the translate endpoint from the configured service URL.
     * The configured value may or may not already end with the translate path.
     */
    private String resolveTranslateEndpoint() {
        String configured = pythonTranslateUrl.strip();
        while (configured.endsWith("/")) {
            configured = configured.substring(0, configured.length() - 1);
        }
        // Only a trailing path is recognised, so a host name containing "translate" is left intact.
        return configured.endsWith(TRANSLATE_PATH) ? configured : configured + TRANSLATE_PATH;
    }

    /**
     * Sends the leftover Chinese segments, one per line, to the translation service.
     *
     * @return the non-blank {@code data} field of the service response
     * @throws IOException           if the HTTP exchange fails
     * @throws InterruptedException  if the calling thread is interrupted while waiting
     * @throws IllegalStateException if the service answers with a non-2xx status, an empty or
     *                               unusable body, or text that looks like an echoed prompt
     */
    private String remediateSegments(List<String> segments, String targetLang, String engine)
            throws IOException, InterruptedException {
        // Only the text to translate is sent, with no extra instructions, so the model
        // cannot confuse instructions with content.
        var body = new LinkedHashMap<String, Object>();
        body.put("text", String.join("\n", segments));
        body.put("target_lang", targetLang);
        body.put("engine", engine != null ? engine : "openai");
        body.put("fallback", true);
        String jsonBody = JSON.toJSONString(body);

        var reqBuilder = HttpRequest.newBuilder()
                .uri(URI.create(resolveTranslateEndpoint()))
                .header("Content-Type", "application/json; charset=UTF-8")
                .timeout(Duration.ofSeconds(60))
                .POST(HttpRequest.BodyPublishers.ofString(jsonBody, StandardCharsets.UTF_8));
        if (pythonApiKey != null && !pythonApiKey.isEmpty()) {
            reqBuilder.header("X-Service-Key", pythonApiKey);
        }

        HttpResponse<String> response = httpClient.send(reqBuilder.build(), HttpResponse.BodyHandlers.ofString());
        int status = response.statusCode();
        if (status < 200 || status >= 300) {
            throw new IllegalStateException("Python服务返回非成功状态码, status=" + status);
        }
        String responseBody = response.body();
        if (responseBody == null || responseBody.isBlank()) {
            throw new IllegalStateException("Python服务返回空响应体, status=" + status);
        }

        JSONObject json = JSON.parseObject(responseBody);
        if (json == null) {
            throw new IllegalStateException("Python服务返回的响应体不是JSON对象, status=" + status);
        }
        String data = json.getString("data");
        if (data == null || data.isBlank()) {
            throw new IllegalStateException("补救翻译返回空数据");
        }
        // A result containing prompt keywords means the model echoed instructions instead of translating.
        if (data.contains("以下文本包含") || data.contains("未翻译的中文") || data.contains("请将它们翻译")) {
            throw new IllegalStateException("补救翻译返回了指令文本,LLM 未正确翻译");
        }
        return data;
    }

    /**
     * Applies the remedial translations to the translated text.
     *
     * <p>Line {@code i} of {@code remedied} is taken as the translation of {@code segments.get(i)}.
     * Segments without a non-blank translation line are left as they are. Every complete Chinese
     * run in the text that equals a translated segment is replaced, in a single left-to-right
     * pass, so repeated runs are all fixed and text inserted by an earlier replacement is never
     * scanned again.
     *
     * @param translatedText the text containing leftover Chinese
     * @param segments       the distinct Chinese runs detected in {@code translatedText}
     * @param remedied       the remedial translation, one line per segment
     * @return the text with the translated runs substituted
     */
    private String applyRemediation(String translatedText, List<String> segments, String remedied) {
        // Blank lines around the payload carry no translation and would shift the alignment.
        String[] remediedLines = remedied.strip().split("\n", -1);
        if (remediedLines.length != segments.size()) {
            log.warn("[后处理] 补救结果行数({})与残留片段数({})不一致,仅按顺序应用可对齐的部分",
                    remediedLines.length, segments.size());
        }

        var replacements = new LinkedHashMap<String, String>();
        int usable = Math.min(segments.size(), remediedLines.length);
        for (int i = 0; i < usable; i++) {
            String translated = remediedLines[i].trim();
            if (!translated.isBlank()) {
                replacements.put(segments.get(i), translated);
            }
        }
        if (replacements.isEmpty()) {
            return translatedText;
        }

        // Walk the same maximal runs the detection found, so a short segment can never be
        // replaced inside a longer, different run.
        StringBuilder result = new StringBuilder(translatedText.length());
        Matcher matcher = CHINESE_PATTERN.matcher(translatedText);
        int cursor = 0;
        while (matcher.find()) {
            String run = matcher.group();
            result.append(translatedText, cursor, matcher.start());
            result.append(replacements.getOrDefault(run, run));
            cursor = matcher.end();
        }
        result.append(translatedText, cursor, translatedText.length());
        return result.toString();
    }

    /** Closes the HTTP client before the bean is destroyed. */
    @PreDestroy
    public void shutdown() {
        httpClient.close();
        log.info("TranslationPostProcessingService HTTP client closed");
    }
}