是否有一个好方法从Java字符串中删除HTML ?一个简单的正则表达式

replaceAll("\\<.*?>", "") 

会起作用,但有些东西像&将不能正确地转换,并且两个尖括号之间的非html将被删除(即。*?在正则表达式中将消失)。


当前回答

我经常发现我只需要去掉注释和脚本元素。这已经为我可靠地工作了15年,可以很容易地扩展到处理HTML或XML中的任何元素名称:

// delete all comments
response = response.replaceAll("<!--[^>]*-->", "");
// delete all script elements
response = response.replaceAll("<(script|SCRIPT)[^+]*?>[^>]*?<(/script|SCRIPT)>", "");

其他回答

这里有一个稍微更充实的更新,试图处理一些格式的休息和列表。我用Amaya的输出作为指导。

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Stack;
import java.util.logging.Logger;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

public class HTML2Text extends HTMLEditorKit.ParserCallback {
    private static final Logger log = Logger
            .getLogger(Logger.GLOBAL_LOGGER_NAME);

    private StringBuffer stringBuffer;

    private Stack<IndexType> indentStack;

    public static class IndexType {
        public String type;
        public int counter; // used for ordered lists

        public IndexType(String type) {
            this.type = type;
            counter = 0;
        }
    }

    public HTML2Text() {
        stringBuffer = new StringBuffer();
        indentStack = new Stack<IndexType>();
    }

    public static String convert(String html) {
        HTML2Text parser = new HTML2Text();
        Reader in = new StringReader(html);
        try {
            // the HTML to convert
            parser.parse(in);
        } catch (Exception e) {
            log.severe(e.getMessage());
        } finally {
            try {
                in.close();
            } catch (IOException ioe) {
                // this should never happen
            }
        }
        return parser.getText();
    }

    public void parse(Reader in) throws IOException {
        ParserDelegator delegator = new ParserDelegator();
        // the third parameter is TRUE to ignore charset directive
        delegator.parse(in, this, Boolean.TRUE);
    }

    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        log.info("StartTag:" + t.toString());
        if (t.toString().equals("p")) {
            if (stringBuffer.length() > 0
                    && !stringBuffer.substring(stringBuffer.length() - 1)
                            .equals("\n")) {
                newLine();
            }
            newLine();
        } else if (t.toString().equals("ol")) {
            indentStack.push(new IndexType("ol"));
            newLine();
        } else if (t.toString().equals("ul")) {
            indentStack.push(new IndexType("ul"));
            newLine();
        } else if (t.toString().equals("li")) {
            IndexType parent = indentStack.peek();
            if (parent.type.equals("ol")) {
                String numberString = "" + (++parent.counter) + ".";
                stringBuffer.append(numberString);
                for (int i = 0; i < (4 - numberString.length()); i++) {
                    stringBuffer.append(" ");
                }
            } else {
                stringBuffer.append("*   ");
            }
            indentStack.push(new IndexType("li"));
        } else if (t.toString().equals("dl")) {
            newLine();
        } else if (t.toString().equals("dt")) {
            newLine();
        } else if (t.toString().equals("dd")) {
            indentStack.push(new IndexType("dd"));
            newLine();
        }
    }

    private void newLine() {
        stringBuffer.append("\n");
        for (int i = 0; i < indentStack.size(); i++) {
            stringBuffer.append("    ");
        }
    }

    public void handleEndTag(HTML.Tag t, int pos) {
        log.info("EndTag:" + t.toString());
        if (t.toString().equals("p")) {
            newLine();
        } else if (t.toString().equals("ol")) {
            indentStack.pop();
            ;
            newLine();
        } else if (t.toString().equals("ul")) {
            indentStack.pop();
            ;
            newLine();
        } else if (t.toString().equals("li")) {
            indentStack.pop();
            ;
            newLine();
        } else if (t.toString().equals("dd")) {
            indentStack.pop();
            ;
        }
    }

    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        log.info("SimpleTag:" + t.toString());
        if (t.toString().equals("br")) {
            newLine();
        }
    }

    public void handleText(char[] text, int pos) {
        log.info("Text:" + new String(text));
        stringBuffer.append(text);
    }

    public String getText() {
        return stringBuffer.toString();
    }

    public static void main(String args[]) {
        String html = "<html><body><p>paragraph at start</p>hello<br />What is happening?<p>this is a<br />mutiline paragraph</p><ol>  <li>This</li>  <li>is</li>  <li>an</li>  <li>ordered</li>  <li>list    <p>with</p>    <ul>      <li>another</li>      <li>list        <dl>          <dt>This</dt>          <dt>is</dt>            <dd>sdasd</dd>            <dd>sdasda</dd>            <dd>asda              <p>aasdas</p>            </dd>            <dd>sdada</dd>          <dt>fsdfsdfsd</dt>        </dl>        <dl>          <dt>vbcvcvbcvb</dt>          <dt>cvbcvbc</dt>            <dd>vbcbcvbcvb</dd>          <dt>cvbcv</dt>          <dt></dt>        </dl>        <dl>          <dt></dt>        </dl></li>      <li>cool</li>    </ul>    <p>stuff</p>  </li>  <li>cool</li></ol><p></p></body></html>";
        System.out.println(convert(html));
    }
}

您可能希望在剥离HTML之前将<br/>和</p>标记替换为换行符,以防止它像Tim建议的那样变得难以识别。

我能想到的唯一方法是删除HTML标记,但在尖括号之间留下非HTML标记,将检查HTML标记列表。沿着这条线……

replaceAll("\\<[\s]*tag[^>]*>","")

然后html解码特殊字符,如&结果不应该被认为是消毒的。

如果你是为Android编写程序,你可以这样做……

androidx.core.text.HtmlCompat.fromHtml(指令,HtmlCompat.FROM_HTML_MODE_LEGACY) .toString ()

接受的答案并不适用于我所指出的测试用例:“a < b or b > c”的结果是“a b or b > c”。

所以,我用TagSoup代替。下面是一个对我的测试用例(以及其他一些测试用例)有效的示例:

import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Logger;

import org.ccil.cowan.tagsoup.Parser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * Take HTML and give back the text part while dropping the HTML tags.
 *
 * There is some risk that using TagSoup means we'll permute non-HTML text.
 * However, it seems to work the best so far in test cases.
 *
 * @author dan
 * @see <a href="http://home.ccil.org/~cowan/XML/tagsoup/">TagSoup</a> 
 */
public class Html2Text2 implements ContentHandler {
private StringBuffer sb;

public Html2Text2() {
}

public void parse(String str) throws IOException, SAXException {
    XMLReader reader = new Parser();
    reader.setContentHandler(this);
    sb = new StringBuffer();
    reader.parse(new InputSource(new StringReader(str)));
}

public String getText() {
    return sb.toString();
}

@Override
public void characters(char[] ch, int start, int length)
    throws SAXException {
    for (int idx = 0; idx < length; idx++) {
    sb.append(ch[idx+start]);
    }
}

@Override
public void ignorableWhitespace(char[] ch, int start, int length)
    throws SAXException {
    sb.append(ch);
}

// The methods below do not contribute to the text
@Override
public void endDocument() throws SAXException {
}

@Override
public void endElement(String uri, String localName, String qName)
    throws SAXException {
}

@Override
public void endPrefixMapping(String prefix) throws SAXException {
}


@Override
public void processingInstruction(String target, String data)
    throws SAXException {
}

@Override
public void setDocumentLocator(Locator locator) {
}

@Override
public void skippedEntity(String name) throws SAXException {
}

@Override
public void startDocument() throws SAXException {
}

@Override
public void startElement(String uri, String localName, String qName,
    Attributes atts) throws SAXException {
}

@Override
public void startPrefixMapping(String prefix, String uri)
    throws SAXException {
}
}

或者,可以使用HtmlCleaner:

private CharSequence removeHtmlFrom(String html) {
    return new HtmlCleaner().clean(html).getText();
}