package com.xly.util;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.regex.Pattern;
import java.util.Set;

/**
 * Static helpers for stripping symbols, punctuation, markup remnants and
 * invisible characters from text.
 *
 * <p>NOTE(review): the copy of this file that was reviewed had lost a span of
 * text between the "remove '-' but keep negative numbers" step of
 * {@link #removePunctuationHtml(String)} and the tail of a tag-stripping
 * method, and several string literals contained raw line breaks. Everything
 * marked NOTE(review) below is a reconstruction and must be checked against
 * version control before this file is merged. Any methods that lived entirely
 * inside the lost span are not restored here.
 */
public class AdvancedSymbolRemover {

    /** Common CJK punctuation (plus a trailing space); used when trimming text edges. */
    private static final String CHINESE_PUNCTUATION = "。,、;:?!「」『』()【】《》<>{}〔〕〖〗〘〙〚〛~·…―--- ";

    /** ASCII punctuation characters. Not referenced by the code visible in this class. */
    private static final String ENGLISH_PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";

    private static final Logger log = LoggerFactory.getLogger(AdvancedSymbolRemover.class);

    // Patterns are compiled once; String.replaceAll would recompile them on every call.
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");
    private static final Pattern HTML_ENTITY = Pattern.compile("&[a-zA-Z0-9#]+;");
    private static final Pattern REPEATED_PUNCTUATION = Pattern.compile("([!?。,;:])\\1+");
    private static final Pattern REPEATED_OPERATOR = Pattern.compile("([-_+=*])\\1+");

    /**
     * One character class covering ASCII punctuation, whitespace and the CJK set.
     * Pattern.quote's \Q...\E form is expanded textually by the regex compiler,
     * so it is valid inside the brackets.
     */
    private static final String EDGE_CLASS =
            "[\\p{Punct}\\s" + Pattern.quote(CHINESE_PUNCTUATION) + "]+";
    private static final Pattern EDGE_SYMBOLS =
            Pattern.compile("^" + EDGE_CLASS + "|" + EDGE_CLASS + "$");

    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^\\p{L}\\p{N}]");
    private static final Pattern NON_ALPHANUMERIC_OR_SPACE = Pattern.compile("[^\\p{L}\\p{N}\\s]");

    /** ASCII control characters except CR, LF and TAB. */
    private static final Pattern CONTROL_CHARS = Pattern.compile("[\\p{Cntrl}&&[^\r\n\t]]");
    /** Unicode general category Cf (format characters). */
    private static final Pattern FORMAT_CHARS = Pattern.compile("\\p{Cf}");

    /** Union of the seven code-point ranges the original removed one pass at a time. */
    private static final Pattern EMOJI_AND_SYMBOLS = Pattern.compile(
            "["
            + "\\x{1F600}-\\x{1F64F}"
            + "\\x{1F300}-\\x{1F5FF}"
            + "\\x{1F680}-\\x{1F6FF}"
            + "\\x{1F700}-\\x{1F77F}"
            + "\\x{1F900}-\\x{1F9FF}"
            + "\\x{2600}-\\x{26FF}"
            + "\\x{2700}-\\x{27BF}"
            + "]");

    /**
     * Cleans HTML-derived text: runs it through {@code HtmlCleaner.cleanHtml},
     * removes leftover "br" fragments, line breaks and spaces, replaces a
     * trailing ".0"/".00" after a digit with a single space, drops the UI
     * phrase "换一换", and removes dashes that are not part of a number.
     *
     * <p>NOTE(review): only the steps up to the dash removal survived in the
     * reviewed copy. The dash pattern, any later steps, and the catch block are
     * reconstructed; confirm against the real file.
     *
     * @param text raw text, may be {@code null}
     * @return the cleaned text; {@code ""} for {@code null} or empty input
     */
    public static String removePunctuationHtml(String text) {
        try {
            if (text == null || text.isEmpty()) {
                return "";
            }
            text = HtmlCleaner.cleanHtml(text);

            // NOTE(review): this also removes "br" inside ordinary words
            // (e.g. "brown"); kept because the literal is intact in the source.
            text = text.replaceAll("br", "");

            // NOTE(review): the reviewed copy showed three damaged literals here:
            // a raw line break, an empty string (a no-op as a pattern) and a
            // second raw line break. They were presumably markup literals.
            // A single line-break removal reproduces their visible effect.
            text = text.replaceAll("\n", "");
            // NOTE(review): shown as a single space; may originally have been an entity.
            text = text.replaceAll(" ", "");

            // Drop a useless ".0" / ".00" that follows a digit and is not followed
            // by another digit; the replacement is a single space, as in the original.
            text = text.replaceAll("(?<=\\d)\\.0+(?!\\d)", " ");

            // Drop the "换一换" UI phrase.
            text = text.replaceAll("换一换", "");

            // Remove '-' but keep negative numbers.
            // NOTE(review): only the leading "(?" of this pattern survived;
            // the rest is a guess that keeps any dash adjacent to a digit.
            text = text.replaceAll("(?<!\\d)-(?!\\d)", "");

            return text;
        } catch (Exception e) {
            // NOTE(review): the original handler was not visible. Best effort:
            // log and return whatever has been produced so far.
            log.error("removePunctuationHtml failed", e);
            return text == null ? "" : text;
        }
    }

    /**
     * Removes anything shaped like an HTML tag ({@code <...>}) and anything
     * shaped like an entity ({@code &name;} or {@code &#123;}).
     *
     * <p>NOTE(review): only the end of this method's body survived. Its name,
     * visibility, null handling and the start of the tag pattern are
     * reconstructed; no caller is visible in this class.
     *
     * @param text input text, may be {@code null}
     * @return text without tags and entities; {@code ""} for {@code null}
     */
    private static String removeHtmlTags(String text) {
        if (text == null) {
            return "";
        }
        String withoutTags = HTML_TAG.matcher(text).replaceAll("");
        return HTML_ENTITY.matcher(withoutTags).replaceAll("");
    }

    /**
     * Collapses runs of one repeated symbol to a single occurrence
     * (for example "!!!" becomes "!").
     *
     * <p>No caller is visible in this class; see the class-level note.
     *
     * @param text non-null input
     * @return text with repeated punctuation and repeated {@code - _ + = *} collapsed
     */
    private static String removeDuplicateSymbols(String text) {
        String collapsed = REPEATED_PUNCTUATION.matcher(text).replaceAll("$1");
        return REPEATED_OPERATOR.matcher(collapsed).replaceAll("$1");
    }

    /**
     * Strips punctuation and whitespace from both ends of the text.
     *
     * <p>ASCII punctuation, whitespace and the CJK set are matched by one
     * character class. The earlier two-pass form stopped early on mixed runs:
     * a CJK mark followed by an ASCII mark outside the CJK set left the ASCII
     * mark in place.
     *
     * <p>No caller is visible in this class; see the class-level note.
     *
     * @param text non-null input
     * @return text without leading or trailing symbol runs
     */
    private static String cleanEdgeSymbols(String text) {
        return EDGE_SYMBOLS.matcher(text).replaceAll("");
    }

    /**
     * Keeps only letters (any script, including Chinese) and digits.
     *
     * @param text input text, may be {@code null}
     * @return text with every other character removed; {@code ""} for {@code null}
     */
    public static String keepOnlyAlphanumeric(String text) {
        if (text == null) {
            return "";
        }
        return NON_ALPHANUMERIC.matcher(text).replaceAll("");
    }

    /**
     * Keeps letters, digits and whitespace.
     *
     * @param text input text, may be {@code null}
     * @return text with every other character removed; {@code ""} for {@code null}
     */
    public static String keepAlphanumericAndSpaces(String text) {
        if (text == null) {
            return "";
        }
        return NON_ALPHANUMERIC_OR_SPACE.matcher(text).replaceAll("");
    }

    /**
     * Removes ASCII control characters (0x00-0x1F and 0x7F) except CR, LF and
     * TAB, then removes Unicode format characters (category Cf).
     *
     * @param text input text, may be {@code null}
     * @return text without those characters; {@code ""} for {@code null}
     */
    public static String removeControlCharacters(String text) {
        if (text == null) {
            return "";
        }
        String withoutControls = CONTROL_CHARS.matcher(text).replaceAll("");
        return FORMAT_CHARS.matcher(withoutControls).replaceAll("");
    }

    /**
     * Removes characters in these code-point ranges: U+1F600-1F64F,
     * U+1F300-1F5FF, U+1F680-1F6FF, U+1F700-1F77F, U+1F900-1F9FF,
     * U+2600-26FF and U+2700-27BF.
     *
     * @param text input text, may be {@code null}
     * @return text without characters from those ranges; {@code ""} for {@code null}
     */
    public static String removeEmojiAndSymbols(String text) {
        if (text == null) {
            return "";
        }
        return EMOJI_AND_SYMBOLS.matcher(text).replaceAll("");
    }

    /**
     * Whitelist filter: keeps letters, digits, whitespace and any character
     * contained in {@code allowedSymbols}; drops everything else.
     *
     * <p>The text is walked by code point so letters outside the Basic
     * Multilingual Plane are recognised instead of being dropped as two
     * unpaired surrogates. The whitelist holds {@code Character} values, so it
     * can only admit BMP symbols.
     *
     * @param text           input text, may be {@code null}
     * @param allowedSymbols extra characters to keep; {@code null} is treated as empty
     * @return the filtered text; {@code ""} for {@code null} text
     */
    public static String keepSpecificSymbols(String text, Set<Character> allowedSymbols) {
        if (text == null) {
            return "";
        }
        StringBuilder kept = new StringBuilder(text.length());
        int index = 0;
        while (index < text.length()) {
            int codePoint = text.codePointAt(index);
            index += Character.charCount(codePoint);

            boolean whitelisted = allowedSymbols != null
                    && Character.isBmpCodePoint(codePoint)
                    && allowedSymbols.contains((char) codePoint);
            if (Character.isLetterOrDigit(codePoint)
                    || Character.isWhitespace(codePoint)
                    || whitelisted) {
                kept.appendCodePoint(codePoint);
            }
        }
        return kept.toString();
    }

    /**
     * Removes whole character categories from the text.
     *
     * <p>The selected categories are combined into one bracketed character
     * class. The previous code appended only the closing bracket, which
     * produced a pattern matching a sequence ending in a literal {@code ]}
     * rather than any single character of the selected categories.
     *
     * @param text              input text, may be {@code null}
     * @param removePunctuation remove Unicode punctuation ({@code \p{P}})
     * @param removeDigits      remove Unicode numbers ({@code \p{N}})
     * @param removeSpaces      remove whitespace ({@code \s})
     * @return the filtered text; the input unchanged when no flag is set;
     *         {@code ""} for {@code null}
     */
    public static String removeByCategory(String text, boolean removePunctuation,
                                          boolean removeDigits, boolean removeSpaces) {
        if (text == null) {
            return "";
        }
        StringBuilder charClass = new StringBuilder("[");
        if (removePunctuation) {
            charClass.append("\\p{P}");
        }
        if (removeDigits) {
            charClass.append("\\p{N}");
        }
        if (removeSpaces) {
            charClass.append("\\s");
        }
        if (charClass.length() == 1) {
            // Nothing selected: only the opening bracket was written.
            return text;
        }
        return text.replaceAll(charClass.append(']').toString(), "");
    }
}