package com.xly.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.regex.Pattern;
import java.util.Set;
public class AdvancedSymbolRemover {
// 常用标点符号集合
private static final String CHINESE_PUNCTUATION = "。,、;:?!「」『』()【】《》<>{}〔〕〖〗〘〙〚〛~·…―--- ";
private static final String ENGLISH_PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
private static final Logger log = LoggerFactory.getLogger(AdvancedSymbolRemover.class);
/**
* 移除所有标点符号(保留字母、数字、中文)
*/
public static String removePunctuationHtml(String text) {
try{
if (text == null || text.isEmpty()) return "";
text = HtmlCleaner.cleanHtml(text);
text = text.replaceAll("br", "");
text = text.replaceAll("
", "");
text = text.replaceAll("", "");
text = text.replaceAll("
", "");
text = text.replaceAll(" ", "");
// 👇 【安全正则】只删除 数字后面的 .0 或 .00
text = text.replaceAll("(?<=\\d)\\.0+(?!\\d)", "");
// 移除中文和英文标点
text = text.replaceAll("[\\pP\\p{Punct}]", "");
// 可选:只保留字母、数字、汉字、空格
text = text.replaceAll("[^\\p{L}\\p{N}\\p{Zs}]", "");
return text;
}catch (Exception e){
}
return text;
}
/**
* 移除所有标点符号(保留字母、数字、中文)
*/
public static String removePunctuation(String text) {
if (text == null || text.isEmpty()) return "";
// 移除中文和英文标点
text = text.replaceAll("[\\pP\\p{Punct}]", "");
// 可选:只保留字母、数字、汉字、空格
text = text.replaceAll("[^\\p{L}\\p{N}\\p{Zs}]", "");
return text;
}
/**
* 智能清理符号(保留必要的分隔符)
*/
public static String cleanSymbolsSmart(String text) {
if (text == null) return "";
// 1. 移除HTML标签和实体
text = removeHtmlTags(text);
// 2. 统一空格
text = text.replaceAll("\\s+", " ");
// 3. 移除多余标点(保留一个)
text = removeDuplicateSymbols(text);
// 4. 清理边缘符号
text = cleanEdgeSymbols(text);
return text.trim();
}
/**
* 移除HTML标签
*/
private static String removeHtmlTags(String text) {
return text.replaceAll("<[^>]*>", "")
.replaceAll("&[a-zA-Z0-9#]+;", "");
}
/**
* 移除连续重复的符号(如!!!变成!)
*/
private static String removeDuplicateSymbols(String text) {
// 处理连续重复的标点
text = text.replaceAll("([!?。,;:])\\1+", "$1");
// 处理连续重复的其他符号
text = text.replaceAll("([-_+=*])\\1+", "$1");
return text;
}
/**
* 清理文本边缘的符号
*/
private static String cleanEdgeSymbols(String text) {
// 移除开头和结尾的符号
text = text.replaceAll("^[\\p{Punct}\\s]+", "");
text = text.replaceAll("[\\p{Punct}\\s]+$", "");
// 移除开头结尾的中文符号
String chinesePunctRegex = "^[" + Pattern.quote(CHINESE_PUNCTUATION) + "\\s]+|" +
"[" + Pattern.quote(CHINESE_PUNCTUATION) + "\\s]+$";
text = text.replaceAll(chinesePunctRegex, "");
return text;
}
/**
* 只保留字母和数字(最严格的清理)
*/
public static String keepOnlyAlphanumeric(String text) {
if (text == null) return "";
// 只保留:字母(包括中文)、数字
return text.replaceAll("[^\\p{L}\\p{N}]", "");
}
/**
* 保留字母、数字和空格
*/
public static String keepAlphanumericAndSpaces(String text) {
if (text == null) return "";
// 保留:字母、数字、空格
return text.replaceAll("[^\\p{L}\\p{N}\\s]", "");
}
/**
* 移除控制字符和不可见字符
*/
public static String removeControlCharacters(String text) {
if (text == null) return "";
// 移除控制字符(0x00-0x1F, 0x7F)
text = text.replaceAll("[\\p{Cntrl}&&[^\r\n\t]]", "");
// 移除Unicode格式字符
text = text.replaceAll("\\p{Cf}", "");
return text;
}
/**
* 移除表情符号和特殊Unicode符号
*/
public static String removeEmojiAndSymbols(String text) {
if (text == null) return "";
// 移除表情符号
text = text.replaceAll("[\\x{1F600}-\\x{1F64F}]", "");
text = text.replaceAll("[\\x{1F300}-\\x{1F5FF}]", "");
text = text.replaceAll("[\\x{1F680}-\\x{1F6FF}]", "");
text = text.replaceAll("[\\x{1F700}-\\x{1F77F}]", "");
// 移除杂项符号和象形文字
text = text.replaceAll("[\\x{1F900}-\\x{1F9FF}]", "");
text = text.replaceAll("[\\x{2600}-\\x{26FF}]", "");
text = text.replaceAll("[\\x{2700}-\\x{27BF}]", "");
return text;
}
/**
* 保留特定符号(白名单方式)
*/
public static String keepSpecificSymbols(String text, Set allowedSymbols) {
if (text == null) return "";
StringBuilder result = new StringBuilder();
for (char c : text.toCharArray()) {
if (Character.isLetterOrDigit(c) ||
Character.isWhitespace(c) ||
allowedSymbols.contains(c)) {
result.append(c);
}
}
return result.toString();
}
/**
* 按类别移除符号
*/
public static String removeByCategory(String text, boolean removePunctuation,
boolean removeDigits, boolean removeSpaces) {
if (text == null) return "";
String regex = "";
if (removePunctuation) {
regex += "\\p{P}";
}
if (removeDigits) {
regex += "\\p{N}";
}
if (removeSpaces) {
regex += "\\s";
}
if (!regex.isEmpty()) {
return text.replaceAll(regex + "]", "");
}
return text;
}
}