HtmlCleaner.java
1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
package com.xly.util;
import org.jsoup.Jsoup;
import org.jsoup.safety.Safelist;
/**
 * Static helpers for stripping or sanitizing HTML using jsoup.
 *
 * <p>Every method treats a {@code null} input as "no content" and returns an
 * empty string instead of throwing.
 *
 * <p>Created 2026/2/7 21:22.
 *
 * @author Qian Bao (钱豹)
 */
public final class HtmlCleaner {

    /** Not instantiable: the class only exposes static helpers. */
    private HtmlCleaner() {
    }

    /**
     * Option 1: drops all markup and keeps only the text.
     *
     * <p>The input is parsed with {@link Jsoup#parse(String)} and the text of
     * the resulting document is returned.
     *
     * @param html the HTML to strip; may be {@code null}
     * @return the document text, or {@code ""} when {@code html} is {@code null}
     */
    public static String removeAllHtml(String html) {
        if (html == null) {
            return "";
        }
        return Jsoup.parse(html).text();
    }

    /**
     * Option 2: sanitizes the input down to a minimal structural safelist.
     *
     * <p>Starts from {@link Safelist#none()} and additionally permits the
     * {@code br}, {@code p} and {@code div} tags plus the {@code class}
     * attribute on {@code p}; the result is whatever
     * {@link Jsoup#clean(String, Safelist)} produces for that safelist.
     *
     * @param html the HTML to sanitize; may be {@code null}
     * @return the sanitized HTML, or {@code ""} when {@code html} is {@code null}
     */
    public static String cleanHtml(String html) {
        if (html == null) {
            return "";
        }
        Safelist structuralOnly = Safelist.none()
                // Optional: keep a little block structure instead of flattening everything.
                .addTags("br", "p", "div")
                .addAttributes("p", "class");
        return Jsoup.clean(html, structuralOnly);
    }

    /**
     * Option 3: sanitizes the input while keeping basic text formatting.
     *
     * <p>Uses {@link Safelist#basic()} and additionally registers the
     * {@code p}, {@code br} and {@code div} tags before delegating to
     * {@link Jsoup#clean(String, Safelist)}.
     *
     * @param html the HTML to sanitize; may be {@code null}
     * @return the sanitized HTML, or {@code ""} when {@code html} is {@code null}
     */
    public static String cleanWithBasicFormatting(String html) {
        if (html == null) {
            return "";
        }
        Safelist basicPlusBlocks = Safelist.basic().addTags("p", "br", "div");
        return Jsoup.clean(html, basicPlusBlocks);
    }

    /**
     * Manual demo: prints the results of {@link #removeAllHtml(String)} and
     * {@link #cleanHtml(String)} for a sample fragment that contains a
     * {@code <script>} element.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = "<div><h1>标题</h1><p>这是一段<b>加粗</b>的文字。</p><script>alert('xss')</script></div>";
        System.out.println("Jsoup文本提取: " + removeAllHtml(html));
        System.out.println("清理HTML: " + cleanHtml(html));
    }
}