简介:新闻网页正文抽取,可提取互联网上99%以上的文章,智能识别包含的标题及正文内容。 互联网上几百万个站点,每个站点还有N个不同的文章页面模版,您只需要接入我们的接口,就无需再为编写获取文章内容的正则而苦恼了,直接提取标题及正文内容。
已连接应用数:2305
Java调用网页提取接口示例:
/**
 * Minimal client for the ip138 web-page article extraction HTTP API.
 *
 * <p>Usage example:
 * <pre>{@code
 * String result = QueryHelper.queryHtml("http://www.sina.com.cn/");
 * }</pre>
 */
public class QueryHelper {

    /**
     * Response format name; the accompanying note lists {@code txt|jsonp|xml} as options.
     * No method in this class reads this field: {@link #queryHtml(String)} always calls
     * the {@code /text/} endpoint.
     */
    public static String DATATYPE = "text";

    /** Connect and read timeout applied to every request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5 * 1000;

    /**
     * Performs an HTTP GET and returns the response body decoded as UTF-8.
     *
     * <p>Redirects are not followed and caches are bypassed. The body is read line by
     * line and the lines are concatenated <em>without</em> their line terminators.
     *
     * @param urlString absolute URL to request
     * @param token     value sent in the {@code token} request header
     * @return the response body, or {@code null} when the URL is malformed, is not an
     *         HTTP(S) URL, the status code is not 200, or an I/O error occurs
     */
    public static String get(String urlString, String token) {
        HttpURLConnection conn = null;
        try {
            // Fully qualified so the class needs no import beyond those it already relies on.
            java.net.URLConnection opened = new URL(urlString).openConnection();
            if (!(opened instanceof HttpURLConnection)) {
                // e.g. file: or jar: URLs; treat like any other failed request instead of
                // letting a ClassCastException escape.
                return null;
            }
            conn = (HttpURLConnection) opened;
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            // doOutput is deliberately left at its default (false): this GET sends no body,
            // and enabling it makes some implementations switch the method to POST.
            conn.setUseCaches(false);
            conn.setInstanceFollowRedirects(false);
            conn.setRequestMethod("GET");
            conn.setRequestProperty("token", token);

            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return null;
            }

            StringBuilder body = new StringBuilder();
            // try-with-resources closes the reader (and the underlying stream) even when
            // readLine() fails part-way through the response.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    body.append(line);
                }
            }
            return body.toString();
        } catch (IOException e) {
            // Best-effort API: report the failure and signal it to the caller with null.
            e.printStackTrace();
            return null;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Requests extraction for the given page from the ip138 text endpoint.
     *
     * @param urlString address of the page to extract; it is percent-encoded as UTF-8
     *                  before being placed in the query string
     * @return the raw response body of the API call, or {@code null} if the request
     *         failed (see {@link #get(String, String)})
     */
    public static String queryHtml(String urlString) {
        String encodedTarget;
        try {
            // Explicit charset: the one-argument encode() is deprecated and uses the
            // platform default encoding, which corrupts non-ASCII URLs on some systems.
            encodedTarget = URLEncoder.encode(urlString, "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 is not supported by this JVM", e);
        }
        String url = "https://api.ip138.com/text/?url=" + encodedTarget + "&type=1";
        // NOTE(review): credential embedded in source; presumably a placeholder to be
        // replaced with the caller's own token - confirm before shipping.
        String token = "4d0c2244837514d6f19ca60f4750f20f";
        return get(url, token);
    }
}
// Usage example:
// QueryHelper.queryHtml("http://www.sina.com.cn/")