<dependency>
<groupId>de.l3s.boilerpipe</groupId>
<artifactId>boilerpipe</artifactId>
<!--<version>1.2.0</version>-->
<version>1.1.0</version>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
<version>2.9.1</version>
</dependency>
<dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>1.9.13</version>
</dependency>
/**
 * Runs boilerpipe's {@code CANOLA_EXTRACTOR} over an HTML string and returns the
 * content of the resulting {@code TextDocument}.
 *
 * @param html the HTML markup to process
 * @return the extracted content; {@code html} itself, unchanged, when
 *         {@code StringUtils.isEmpty(html)} is true (presumably null or empty);
 *         or {@code null} when parsing or extraction throws (the stack trace is
 *         printed and the failure is not propagated)
 */
public static String getNewsContent(String html) {
    if (StringUtils.isEmpty(html)) {
        return html;
    }

    // A single charset name is used both to encode the string into bytes and to
    // declare those bytes to the parser. Encoding with the platform default
    // (plain getBytes()) while declaring UTF-8 garbles non-ASCII text on any JVM
    // whose default charset is not UTF-8.
    final String encoding = "UTF-8";

    String content = null;
    try {
        // No close()/finally needed: a ByteArrayInputStream holds no system
        // resources and closing it has no effect.
        InputStream is = new ByteArrayInputStream(html.getBytes(encoding));
        InputSource inputSource = new InputSource(is);
        inputSource.setEncoding(encoding); // must match the encoding of the bytes above

        TextDocument textDocument = new BoilerpipeSAXInput(inputSource).getTextDocument();
        BoilerpipeExtractor extractor = CommonExtractors.CANOLA_EXTRACTOR;
        extractor.process(textDocument);
        content = textDocument.getContent();
    } catch (Exception e) {
        // Best-effort extraction: report the failure and fall through to return null.
        e.printStackTrace();
    }
    return content;
}