国产探花免费观看_亚洲丰满少妇自慰呻吟_97日韩有码在线_资源在线日韩欧美_一区二区精品毛片,辰东完美世界有声小说,欢乐颂第一季,yy玄幻小说排行榜完本

首頁 > 學院 > 開發設計 > 正文

URLConnection實現爬蟲(解決重定向、設置cookie才能抓取頁面等問題)

2019-11-08 01:57:26
字體:
來源:轉載
供稿:網友

1.關鍵方法

/** * 向指定 URL 發(fā)送POST方法的請(qǐng)求 * * @param url * 發(fā)送請(qǐng)求的 URL * @param param * 請(qǐng)求參數(shù),請(qǐng)求參數(shù)應(yīng)該是 name1=value1&name2=value2 的形式。 * @param encode * 請(qǐng)求頁(yè)面的字符編碼 * @param cookie * cookie * @return 所代表遠(yuǎn)程資源的響應(yīng)結(jié)果 */ public static String sendPost1(String url, String param, String encode,String cookie) { PRintWriter out = null; BufferedReader in = null; String result = ""; try { URL realUrl = new URL(url); // 打開(kāi)和URL之間的連接 URLConnection conn = realUrl.openConnection(); // 設(shè)置通用的請(qǐng)求屬性 conn.setRequestProperty("accept", "*/*"); conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8"); conn.setRequestProperty("Cache-Control","max-age=0"); conn.setRequestProperty("connection", "Keep-Alive"); conn.setRequestProperty("Cookie",cookie); //conn.setRequestProperty("Host","www.zjtax.gov.cn"); conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); // 發(fā)送POST請(qǐng)求必須設(shè)置如下兩行 conn.setDoOutput(true); conn.setDoInput(true); // 獲取URLConnection對(duì)象對(duì)應(yīng)的輸出流 out = new PrintWriter(conn.getOutputStream()); // 發(fā)送請(qǐng)求參數(shù) out.print(param); // flush輸出流的緩沖 out.flush(); // 定義BufferedReader輸入流來(lái)讀取URL的響應(yīng) in = new BufferedReader( new InputStreamReader(conn.getInputStream(),encode)); String line; while ((line = in.readLine()) != null) { result += line; } } catch (Exception e) { System.out.println("發(fā)送 POST 請(qǐng)求出現(xiàn)異常!"+e); e.printStackTrace(); } //使用finally塊來(lái)關(guān)閉輸出流、輸入流 finally{ try{ if(out!=null){ out.close(); } if(in!=null){ in.close(); } } catch(IOException ex){ ex.printStackTrace(); } } return result; } /** * 獲取cookie * * @param url * 發(fā)送請(qǐng)求的URL * @return key=value;key=value;... 
*/ public static String getCookie2(String url) { HttpURLConnection conn = null; try { URL realUrl = new URL(url); conn = (HttpURLConnection) realUrl.openConnection(); conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); conn.setRequestProperty("Accept-Encoding","gzip, deflate, sdch"); conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8"); conn.setRequestProperty("Cache-Control","max-age=0"); conn.setRequestProperty("connection", "Keep-Alive"); //conn.setRequestProperty("Host","www.zjtax.gov.cn"); conn.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); //是否自動(dòng)執(zhí)行 http 重定向,默認(rèn)為true //如果實(shí)際操作中,不存在重定向問(wèn)題,不需要設(shè)置此行。 conn.setInstanceFollowRedirects(false); conn.setDoInput(true); conn.setDoOutput(true); conn.setRequestMethod("POST"); } catch (Exception e) { e.printStackTrace(); } String sessionId = ""; String cookieVal = ""; String key = null; // Map<String,List<String>> map = conn.getHeaderFields();// for (String key1 : map.keySet()) {// System.out.println(key1 + "--->" + map.get(key1));// } //取cookie for(int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++){ if(key.equalsIgnoreCase("set-cookie")){ cookieVal = conn.getHeaderField(i); cookieVal = cookieVal.substring(0, cookieVal.indexOf(";")); sessionId = sessionId + cookieVal + ";"; } } //如果實(shí)際操作中,不存在重定向問(wèn)題,不需要以下四行 String location= conn.getHeaderField("Location");//獲取 重定向地址 List<String> list = getCookie3(location,sessionId); List<String> list2 = getCookie3(list.get(1),sessionId+list.get(0)); sessionId = sessionId + list2.get(0); return sessionId; } /** * 獲取 cookie * @param url * 發(fā)送請(qǐng)求的URL * @param cookie * cookie */ public static List<String> getCookie3(String url,String cookie) { HttpURLConnection conn = null; try { URL realUrl = new URL(url); conn = (HttpURLConnection) realUrl.openConnection(); 
conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); conn.setRequestProperty("Accept-Encoding","gzip, deflate, sdch"); conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8"); conn.setRequestProperty("Cache-Control","max-age=0"); conn.setRequestProperty("connection", "Keep-Alive"); //conn.setRequestProperty("Host","www.zjtax.gov.cn"); conn.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); conn.setRequestProperty("Cookie",cookie); conn.setInstanceFollowRedirects(false); conn.setDoInput(true); conn.setDoOutput(true); conn.setRequestMethod("POST"); } catch (Exception e) { e.printStackTrace(); } String sessionId = ""; String cookieVal = ""; String key = null; String location= conn.getHeaderField("Location"); for(int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++){ if(key.equalsIgnoreCase("set-cookie")){ cookieVal = conn.getHeaderField(i); cookieVal = cookieVal.substring(0, cookieVal.indexOf(";")); sessionId = sessionId + cookieVal + ";"; } } List<String> list = new ArrayList<String>(); list.add(sessionId);//存放cookie list.add(location);//存放重定向地址 return list; }

另附,最基本的get抓取、post抓取、獲取cookie方法

public class HttpURLContent {

    /**
     * Sends a GET request to the given URL and returns the response body as text.
     *
     * <p>Response lines are concatenated without line separators. The response is
     * decoded with the platform default charset. If anything fails, the exception is
     * printed and whatever was read so far (possibly "") is returned.
     *
     * @param url   URL to send the request to
     * @param param query string, expected in the form name1=value1&name2=value2;
     *              when null or empty the URL is requested as-is
     * @return the response text of the remote resource
     */
    public static String sendGet(String url, String param) {
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            // Only append "?query" when there is a query; url + "?" + null would
            // otherwise request "...?null".
            String urlNameString = (param == null || param.length() == 0) ? url : url + "?" + param;
            URL realUrl = new URL(urlNameString);
            // Open the connection to the URL.
            URLConnection connection = realUrl.openConnection();
            // Common request headers.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Establish the actual connection.
            connection.connect();
            // Read the response.
            in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("發送GET請求出現異常!" + e);
            e.printStackTrace();
        } finally {
            // Close the input stream.
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return result.toString();
    }

    /**
     * Sends a POST request to the given URL and returns the response body as text.
     *
     * <p>Response lines are concatenated without line separators. The response is
     * decoded with the platform default charset. If anything fails, the exception is
     * printed and whatever was read so far (possibly "") is returned.
     *
     * @param url   URL to send the request to
     * @param param request body, expected in the form name1=value1&name2=value2;
     *              when null an empty body is sent (instead of the literal text "null")
     * @return the response text of the remote resource
     */
    public static String sendPost(String url, String param) {
        PrintWriter out = null;
        BufferedReader in = null;
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            // Open the connection to the URL.
            URLConnection conn = realUrl.openConnection();
            // Common request headers.
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags must be set to send a POST request.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            // Write the request parameters to the connection's output stream.
            out = new PrintWriter(conn.getOutputStream());
            if (param != null) {
                out.print(param);
            }
            out.flush();
            // Read the response.
            in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("發送 POST 請求出現異常!" + e);
            e.printStackTrace();
        } finally {
            // Close the output and input streams.
            if (out != null) {
                out.close();
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ex) {
                    ex.printStackTrace();
                }
            }
        }
        return result.toString();
    }

    /**
     * Requests the URL once (as a POST without a body) and returns the cookies the
     * server sets in its response.
     *
     * @param url URL to send the request to
     * @return the name=value part of every Set-Cookie header, each followed by ';';
     *         "" when no cookie is set or the connection could not be set up
     */
    public static String getCookie(String url) {
        HttpURLConnection conn = null;
        try {
            URL realUrl = new URL(url);
            conn = (HttpURLConnection) realUrl.openConnection();
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setRequestMethod("POST");
        } catch (Exception e) {
            e.printStackTrace();
            // A partially configured connection is not usable for the header scan.
            conn = null;
        }
        if (conn == null) {
            return "";
        }

        StringBuilder sessionId = new StringBuilder();
        try {
            String key;
            // Collect the cookies from the response headers.
            for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                if (!key.equalsIgnoreCase("set-cookie")) {
                    continue;
                }
                String cookieVal = conn.getHeaderField(i);
                if (cookieVal == null) {
                    continue;
                }
                // Drop the cookie attributes; a cookie without any has no ';'.
                int semicolon = cookieVal.indexOf(';');
                if (semicolon >= 0) {
                    cookieVal = cookieVal.substring(0, semicolon);
                }
                sessionId.append(cookieVal).append(';');
            }
        } finally {
            conn.disconnect();
        }
        return sessionId.toString();
    }
}

2.問題總結 第一步:使用最基本方法,直接抓取,抓取到內容,恭喜你。

第二步:直接抓取頁面無果時,通過設置cookie抓取,即conn.setRequestProperty("Cookie",cookie);

第三步:新的問題是,如何獲取cookie,當第一次訪問頁面時會產生cookie。所以要先訪問一次頁面,拿到cookie。即getCookie(String url)方法

第四步:這里就比較復雜了,我接觸的大部分頁面抓取,目標頁面不存在重定向。如果遇到,就需要使用getCookie2()和getCookie3()方法 獲取cookie。

這也是我目前遇到最麻煩的抓取,用了二天才解決。加油加油加油!!!

3.測試代碼

/** * 出口退稅率查詢(xún) * 測(cè)試url: * http://www.zjtax.gov.cn/wcm/xchaxun/tuishui.jsp?sotype=FULLNAME&sovalue=鋼鐵&PageIndex=1 */ public HashMap<String,Object> getCktsls(String url){ //先獲取cookie String cookie= HttpURLContent.getCookie2("http://www.zjtax.gov.cn/wcm/xchaxun/tuishui.jsp"); HashMap<String,Object> re = new HashMap<String,Object>(); //抓取結(jié)果 String result = HttpURLContent.sendPost1(url,null,"utf-8",cookie); //System.out.println(result); //以下代碼是對(duì)結(jié)果的處理了。。。根據(jù)實(shí)際情況。。。 if(result.contains("<font color='#104194'>共")){//查詢(xún)到結(jié)果 //總頁(yè)數(shù) String[] result_arr = result.split("<font color='#104194'>共"); String totalPage_str = result_arr[1].substring(0, result_arr[1].indexOf("頁(yè)")).trim(); List<Map<String,String>> mapList = new ArrayList<Map<String,String>>(); String[] result_arr1 = result.split("class=/"gs_cx4_sp7/">"); for(int i=1;i<result_arr1.length;i++){ Map<String,String> map = new HashMap<String,String>(); map.put("number", result_arr1[i].substring(0, result_arr1[i].indexOf("</span>"))); String[] result_arr2 = result_arr1[i].split("/">"); for(int j=1;j<result_arr2.length;j++){ String value = ""; if(j<=5) value = result_arr2[j].substring(0, result_arr2[j].indexOf("</span>")); switch (j) { case 1: map.put("nsrmc",value ); break; case 2: map.put("type", value); break; case 3: map.put("sdate", value); break; case 4: map.put("edate", value); break; case 5: map.put("sign", value); break; default: break; } } mapList.add(map); } re.put("totalPage_str", totalPage_str); re.put("result", mapList); }else{//未查詢(xún)到結(jié)果 } return re; }
發表評論 共有條評論
用戶名: 密碼:
驗證碼: 匿名發表
主站蜘蛛池模板: 盘山县| 柳江县| 闽侯县| 巨野县| 崇左市| 灵璧县| 嘉峪关市| 沂源县| 博客| 呼和浩特市| 波密县| 镇远县| 江油市| 宣恩县| 海伦市| 元谋县| 宣化县| 射阳县| 托克托县| 深圳市| 循化| 余姚市| 铁力市| 淮阳县| 东乌珠穆沁旗| 五华县| 屯昌县| 精河县| 淮滨县| 邛崃市| 建阳市| 保德县| 昌江| 漳州市| 惠水县| 元朗区| 将乐县| 泰州市| 沾化县| 始兴县| 农安县|