简介:新闻网页正文抽取,可提取互联网上99%以上的文章,智能识别其中包含的标题及正文内容。 互联网上有几百万个站点,每个站点还有N个不同的文章页面模板,您只需要接入我们的接口,就无需再为编写提取文章内容的正则表达式而苦恼,可直接提取标题及正文内容。
已连接应用数:2308
Java调用网页提取接口示例:
/**
 * Small static helper that calls the ip138 web-page text-extraction HTTP API
 * and returns the raw response body as a string.
 *
 * <p>All failures (malformed URL, I/O error, non-200 status, non-HTTP URL) are
 * reported to the caller as a {@code null} return value.
 */
public class QueryHelper {
    /**
     * Response data type; the original note lists {@code txt|jsonp|xml} as the options.
     *
     * <p>NOTE(review): no method in this class reads this field, and its value
     * ("text") is not one of the listed options. Confirm whether it was meant
     * to be sent with the request.
     */
    public static String DATATYPE = "text";

    /**
     * Performs an HTTP GET on {@code urlString}, sending {@code token} in a
     * request header named {@code token}, and returns the response body.
     *
     * <p>The body is decoded as UTF-8 and read line by line; line terminators
     * are dropped, so the returned string is the concatenation of all lines.
     * Redirects are not followed, and connect/read timeouts are 5 seconds each.
     *
     * @param urlString absolute HTTP(S) URL to request
     * @param token     value for the {@code token} request header
     * @return the response body when the status code is 200; {@code null} when
     *         the URL is malformed or not HTTP(S), the status is not 200, or an
     *         I/O error occurs (the I/O error's stack trace is printed)
     */
    public static String get(String urlString, String token) {
        HttpURLConnection conn = null;
        try {
            URL url = new URL(urlString);
            java.net.URLConnection raw = url.openConnection();
            // A non-HTTP URL (e.g. file:) yields a different connection type;
            // report it like any other failure instead of a ClassCastException.
            if (!(raw instanceof HttpURLConnection)) {
                return null;
            }
            conn = (HttpURLConnection) raw;
            conn.setConnectTimeout(5 * 1000);
            conn.setReadTimeout(5 * 1000);
            conn.setDoInput(true);
            // setDoOutput(true) is deliberately not called: a GET has no request
            // body, and enabling output can make some HttpURLConnection
            // implementations send a POST instead.
            conn.setUseCaches(false);
            conn.setInstanceFollowRedirects(false);
            conn.setRequestMethod("GET");
            conn.setRequestProperty("token", token);

            if (conn.getResponseCode() != 200) {
                return null;
            }

            StringBuilder builder = new StringBuilder();
            // try-with-resources closes the reader even if readLine() throws.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "utf-8"))) {
                for (String s = br.readLine(); s != null; s = br.readLine()) {
                    builder.append(s);
                }
            }
            return builder.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // Release the underlying connection on every path, including
            // non-200 responses whose body was never consumed.
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Requests text extraction for the page at {@code urlString} by calling
     * {@code https://api.ip138.com/text/} with the page URL as the {@code url}
     * query parameter and {@code type=1}.
     *
     * @param urlString address of the page to extract; must not be {@code null}
     * @return the API response body, or {@code null} if the request failed
     *         (see {@link #get(String, String)})
     */
    public static String queryHtml(String urlString) {
        String encodedTarget;
        try {
            // Encode explicitly as UTF-8; the one-argument overload is deprecated
            // and uses the platform default charset, so non-ASCII URLs would be
            // encoded differently from machine to machine.
            encodedTarget = URLEncoder.encode(urlString, "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 charset is unavailable", e);
        }
        String url = "https://api.ip138.com/text/?url=" + encodedTarget + "&type=1";
        // NOTE(review): sample token hard-coded in source; replace it with your
        // own token loaded from configuration before real use.
        String token = "4d0c2244837514d6f19ca60f4750f20f";
        return get(url, token);
    }
}
//以下是使用示例:
//QueryHelper.queryHtml("http://www.sina.com.cn/")