You need to sign in before continuing.
HtmlCleaner.java 1.37 KB
package com.xly.util;

import org.jsoup.Jsoup;
import org.jsoup.safety.Safelist;

/**
 * Static helpers for stripping or sanitizing HTML using jsoup.
 *
 * <p>Every method treats a {@code null} input as an empty string, so callers
 * never receive {@code null} back.
 *
 * @author 钱豹
 * @since 2026/2/7
 */
public final class HtmlCleaner {

    /** Not instantiable: this class only hosts static utility methods. */
    private HtmlCleaner() {
    }

    /**
     * Option 1: removes all HTML markup and keeps only the text.
     *
     * <p>The input is parsed as a document and its combined text is returned
     * via {@code Document.text()}, so the result contains no tags at all.
     *
     * @param html the HTML to strip; may be {@code null}
     * @return the text content of {@code html}, or {@code ""} when {@code html} is {@code null}
     */
    public static String removeAllHtml(String html) {
        if (html == null) {
            return "";
        }
        return Jsoup.parse(html).text();
    }

    /**
     * Option 2: sanitizes HTML down to a minimal structural safelist.
     *
     * <p>Starts from {@code Safelist.none()} and additionally permits the
     * {@code br}, {@code p} and {@code div} tags, plus the {@code class}
     * attribute on {@code p}. Everything else is removed by
     * {@code Jsoup.clean}.
     *
     * @param html the HTML to sanitize; may be {@code null}
     * @return the sanitized HTML, or {@code ""} when {@code html} is {@code null}
     */
    public static String cleanHtml(String html) {
        if (html == null) {
            return "";
        }

        // Text plus a few structural tags only; note that <p class="..."> is
        // kept as well, so the output is not purely "text and line breaks".
        Safelist structuralOnly = Safelist.none()
                .addTags("br", "p", "div")
                .addAttributes("p", "class");
        return Jsoup.clean(html, structuralOnly);
    }

    /**
     * Option 3: sanitizes HTML while keeping basic text formatting.
     *
     * <p>Uses {@code Safelist.basic()} extended with the {@code div} tag.
     *
     * @param html the HTML to sanitize; may be {@code null}
     * @return the sanitized HTML, or {@code ""} when {@code html} is {@code null}
     */
    public static String cleanWithBasicFormatting(String html) {
        if (html == null) {
            return "";
        }

        // Safelist.basic() already permits <p> and <br>; only <div> has to be added.
        Safelist basicWithDiv = Safelist.basic().addTags("div");
        return Jsoup.clean(html, basicWithDiv);
    }

    /**
     * Small manual demo: prints the results of {@link #removeAllHtml(String)}
     * and {@link #cleanHtml(String)} for a sample snippet that contains a
     * {@code <script>} element.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String html = "<div><h1>标题</h1><p>这是一段<b>加粗</b>的文字。</p><script>alert('xss')</script></div>";

        System.out.println("Jsoup文本提取: " + removeAllHtml(html));
        System.out.println("清理HTML: " + cleanHtml(html));
    }
}