公司有過一個需求,需要拿一個網頁的表格數據,數據量達到30w左右。為了提高工作效率,
結合自身經驗和網上資料,寫了一套符合自己需求的nodejs爬蟲工具,也許也會適合你。
先上代碼,再做講解。
'use strict';

// Import modules.
const superagent = require('superagent');
const cheerio = require('cheerio');
const Excel = require('exceljs');

// Root of the site being crawled.
const SITE_ORIGIN = 'http://bxjd.henoo.com';

// Spoofed session cookie sent with every request.
const Cookies = 'PHPSESSID=1c948cafb361cb5dce87122846e649cd';

// Every row collected by the crawl, in page order.
let pageDatas = [];

// Pages `count` (inclusive) up to `limit` (exclusive) are crawled.
const count = 1;
const limit = 3;

/**
 * Builds the URL of one page of the policy list.
 *
 * @param {number} page - Page number to request.
 * @returns {string} Absolute list-page URL.
 */
function buildListUrl(page) {
  const url = new URL('/policy/policyList', SITE_ORIGIN);
  url.searchParams.set('page', String(page));
  return url.toString();
}

/**
 * Builds the URL of the detail view of one policy.
 *
 * @param {string} id - Value of the first cell of the list row.
 * @returns {string} Absolute detail-page URL.
 */
function buildDetailUrl(id) {
  const url = new URL('/policy/view', SITE_ORIGIN);
  url.searchParams.set('id', id);
  return url.toString();
}

/**
 * Starts the crawl of one list page.
 *
 * @param {string} baseUrl - URL of the list page.
 * @returns {Promise<object[]>} Resolves with the rows scraped from that page.
 */
function loadPage(baseUrl) {
  return getPageLoad(baseUrl);
}

/**
 * Downloads one list page and, for every table row on it, the matching
 * detail page, and merges both into one record per row.
 *
 * A failed request is logged instead of being swallowed; the rows collected
 * before the failure are still returned, so one bad page does not discard
 * the rest of the crawl. This function therefore never rejects.
 *
 * @param {string} baseUrl - URL of the list page.
 * @returns {Promise<object[]>} Rows scraped from the page, in table order.
 */
async function getPageLoad(baseUrl) {
  const rows = [];
  try {
    const body = await superagent.get(baseUrl).set('Cookie', Cookies);
    const $ = cheerio.load(body.text);
    const trList = $('#tableList').children('tr');

    for (let i = 0; i < trList.length; i++) {
      const tdArr = trList.eq(i).find('td');
      if (tdArr.length === 0) {
        // A row without <td> cells carries no data to export.
        continue;
      }

      const id = tdArr.eq(0).text();
      const item = {
        sortId: id,
        policyId: tdArr.eq(1).text(),
        policyProductName: tdArr.eq(2).text(),
        policyName: tdArr.eq(3).text(),
        policyMoney: tdArr.eq(4).text(),
      };

      // Detail pages are fetched one at a time within a page.
      const detailBody = await superagent
        .get(buildDetailUrl(id))
        .set('Cookie', Cookies);
      const $$ = cheerio.load(detailBody.text);
      const detailRows = $$('.table-view').find('tr');
      // The value sits in the second cell of each detail-table row.
      const detailValue = (rowIndex) =>
        detailRows.eq(rowIndex).find('td').eq(1).text();

      // Policyholder ID number.
      item.policyIdNum = detailValue(11);
      // Policyholder phone number.
      item.policyPhone = detailValue(10);
      // Insured person's phone number.
      item.bePoliciedPhone = detailValue(16);
      // Insured person's name.
      item.bePoliciedName = detailValue(13);
      console.log(item.bePoliciedName);
      // Insured person's ID number.
      item.bePoliciedIdNum = detailValue(17);

      rows.push(item);
    }
  } catch (error) {
    console.error(
      `Failed while crawling ${baseUrl}; keeping ${rows.length} row(s) collected so far.`,
      error
    );
  }
  return rows;
}

/**
 * Writes the collected rows to ./projects.xlsx.
 *
 * Write failures are logged, not thrown.
 *
 * @param {object[]} pageDatas - Records keyed by the column keys below.
 * @returns {Promise<void>} Settles once the write attempt has finished.
 */
async function writeXLS(pageDatas) {
  const workbook = new Excel.Workbook();
  const sheet = workbook.addWorksheet('My Sheet');
  const reColumns = [
    { header: '序號', key: 'sortId' },
    { header: '投保單號', key: 'policyId' },
    { header: '產品名稱', key: 'policyProductName' },
    { header: '投保人姓名', key: 'policyName' },
    { header: '投保人手機號', key: 'policyPhone' },
    { header: '投保人證件號', key: 'policyIdNum' },
    { header: '被保人姓名', key: 'bePoliciedName' },
    { header: '被保人手機號', key: 'bePoliciedPhone' },
    { header: '被保人證件號', key: 'bePoliciedIdNum' },
    { header: '保費', key: 'policyMoney' },
  ];
  sheet.columns = reColumns;
  for (const trData of pageDatas) {
    sheet.addRow(trData);
  }

  const filename = './projects.xlsx';
  try {
    await workbook.xlsx.writeFile(filename);
    console.log('ok');
  } catch (error) {
    console.error(error);
  }
}

/**
 * Crawls every configured page, then exports all rows in one go.
 *
 * The export runs after all pages have settled, so it does not depend on a
 * fixed number of rows per page or on the order in which pages finish.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const pageTasks = [];
  for (let page = count; page < limit; page++) {
    // All pages are started up front and crawled concurrently.
    pageTasks.push(loadPage(buildListUrl(page)));
  }

  const rowsPerPage = await Promise.all(pageTasks);
  pageDatas = [];
  for (const rows of rowsPerPage) {
    pageDatas.push(...rows);
  }

  if (pageDatas.length === 0) {
    console.error('No rows were collected; skipping the spreadsheet export.');
    return;
  }
  await writeXLS(pageDatas);
}

main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
新聞熱點
疑難解答
圖片精選