国产探花免费观看_亚洲丰满少妇自慰呻吟_97日韩有码在线_资源在线日韩欧美_一区二区精品毛片,辰东完美世界有声小说,欢乐颂第一季,yy玄幻小说排行榜完本

首頁 > 編程 > Python > 正文

下載糗事百科的內容_python版

2020-02-23 04:41:59
字體:
來源:轉載
供稿:網友
代碼如下:
#coding:utf-8

import urllib.request
import xml.dom.minidom
import sqlite3
import threading
import time

class logger(object):
    """Minimal console logger that prints every argument on its own line."""

    def log(self, *msg):
        """Print each positional argument on a separate line.

        Args:
            *msg: Any number of objects; each one is passed to ``print``
                individually. Calling with no arguments prints nothing.
        """
        for item in msg:
            print(item)


# Module-wide logger instance used by the classes and functions in this file.
Log = logger()
# Smoke-test message emitted once at import time.
Log.log('測試下')

class downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the address to fetch.

        Args:
            url: Address handed unchanged to ``urllib.request.urlopen``.
        """
        self.url = url

    def download(self):
        """Fetch ``self.url`` and return the response body.

        Returns:
            The bytes read from the response, or ``None`` when the request
            fails for any reason (the failure is logged, not raised).
        """
        Log.log('開始下載', self.url)
        try:
            # The context manager closes the connection even if read() raises.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as exc:
            # Best effort by design: report the failure and let the caller
            # skip this page.  ``Exception`` (rather than a bare ``except``)
            # lets KeyboardInterrupt and SystemExit propagate.
            Log.log('下載出錯', exc)
            return None
        Log.log('下載完畢')
        return content


class parser(object):
    """Extract the article text and neighbouring article ids from a page."""

    def __init__(self, content):
        """Parse the page into a DOM tree.

        Args:
            content: Page markup accepted by
                ``xml.dom.minidom.parseString``; it must be well-formed XML,
                otherwise ``parseString`` raises.
        """
        # Root document node of the parsed page.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the article text and the linked article ids.

        Returns:
            A dict with two keys: ``'content'`` holds the data of the first
            child node of the last ``<div class="content">`` found (``''``
            when there is none), and ``'url'`` holds a list of ids sliced out
            of the ``href`` of every ``<a>`` that directly wraps a ``<span>``.
        """
        Log.log('開始提取數據')
        contents = {'content': '', 'url': []}

        # Locate the div(s) carrying the article body.
        for div in self.html.getElementsByTagName('div'):
            if (div.hasAttribute('class')
                    and div.getAttribute('class') == 'content'):
                # The article text is the first child node.  An empty div, or
                # a first child without character data (e.g. a nested
                # element), is skipped instead of raising.
                text = getattr(div.firstChild, 'data', None)
                if text is not None:
                    contents['content'] = text

        # Previous/next article links: <span> elements wrapped in an <a>.
        for span in self.html.getElementsByTagName('span'):
            parent = span.parentNode
            # The Document node has no tagName, hence the getattr guard.
            if getattr(parent, 'tagName', None) == 'a':
                url = parent.getAttribute('href')
                # Drop the first 10 and the last 4 characters of the href,
                # leaving the id (presumably '/articles/<id>.htm' -- the
                # download URL in this file has that shape; confirm).
                contents['url'].append(url[10:-4])

        Log.log('提取數據完畢')
        return contents

def downloadPage(qid, db):
    """Download one article page, parse it and record the results in ``db``.

    Args:
        qid: Article id; it is converted with ``str`` and inserted into the
            article URL.
        db: Storage object providing ``updateContent(qid, text)``,
            ``addQID(qid)`` and ``updateStatus(qid, status)``.

    Nothing is written when the download fails or the page cannot be parsed.
    """
    # Local import keeps this function self-contained; the module itself is
    # already loaded indirectly by xml.dom.minidom when it parses.
    from xml.parsers.expat import ExpatError

    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = downloader(url).download()
    if not content:
        return

    try:
        page = parser(content)
    except ExpatError as exc:
        # Malformed markup: log and skip instead of killing the worker thread.
        Log.log('解析出錯', exc)
        return

    contents = page.parse()
    if contents['content']:
        db.updateContent(qid, contents['content'])
    # Queue every article id discovered on this page.
    for linked_qid in contents['url']:
        db.addQID(linked_qid)
    # Exactly two links (presumably previous and next) were found.
    # NOTE(review): the meaning of status 2 is defined by ``db`` -- confirm.
    if len(contents['url']) == 2:
        db.updateStatus(qid, 2)

# Download pool: bounds how many links may be downloaded at the same time.
class downloaderPool(object):
    """Fixed-size set of slots, each holding a download thread or ``None``."""

    def __init__(self, maxLength=15):
        """Create an empty pool.

        Args:
            maxLength: Number of slots, i.e. the maximum number of download
                threads tracked at once.
        """
        # One entry per slot; None marks a free slot.
        self.downloaders = [None] * maxLength
        # Pending article ids.
        self.downloadList = []
        # Storage object handed to every download thread.
        self.db = None

    def setDownloadList(self, downloadList):
        """Merge ``downloadList`` into the pending ids, dropping duplicates.

        Args:
            downloadList: List of hashable ids to add.  The resulting order
                is unspecified because the merge goes through a ``set``.
        """
        self.downloadList = list(set(self.downloadList + downloadList))

    def setdb(self, db):
        """Set the storage object passed to ``downloadPage``."""
        self.db = db

    def daemon(self):
        """Run one maintenance pass: free finished slots, then refill them.

        NOTE(review): the original comment speaks of polling once per second,
        but no loop or sleep is visible in this method; presumably the caller
        invokes it repeatedly -- confirm.
        """
        Log.log('設置守護進程')
        # Release slots whose thread has finished.
        for index, worker in enumerate(self.downloaders):
            if worker is not None and not worker.is_alive():
                Log.log('將下載器置空', index)
                self.downloaders[index] = None

        # Start a new download in every free slot while ids are available.
        for index, worker in enumerate(self.downloaders):
            if worker is not None:
                continue
            # NOTE(review): getQID is not defined in the visible part of this
            # file; presumably it returns the next pending id or a falsy
            # value when none is left -- confirm.
            qid = self.getQID()
            if qid:
                t = threading.Thread(target=downloadPage,
                                     args=(qid, self.db))
                self.downloaders[index] = t
                t.start()
                # NOTE(review): joining right after start() makes downloads
                # run one at a time, so the pool never runs in parallel.
                # Kept as-is because it is unknown whether ``db`` tolerates
                # concurrent use.
                t.join()
發表評論 共有條評論
用戶名: 密碼:
驗證碼: 匿名發表
主站蜘蛛池模板: 清镇市| 崇礼县| 阳曲县| 新平| 临邑县| 新晃| 突泉县| 图片| 敦煌市| 万年县| 繁峙县| 淮阳县| 天柱县| 虎林市| 望城县| 保靖县| 理塘县| 灵寿县| 三台县| 万盛区| 东山县| 乌海市| 都安| 安图县| 攀枝花市| 邹城市| 涿鹿县| 和龙市| 呼图壁县| 民县| 会同县| 屏东市| 肃北| 个旧市| 黄陵县| 万盛区| 镇坪县| 通辽市| 红桥区| 化州市| 重庆市|