python正則匹配抓取豆瓣電影鏈接和評論代碼分享

2020-02-23 05:02:24

字體：大中小

供稿：網友

代碼如下:
import urllib.request
import re
import time

def movie(movieTag):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        movieTag: URL of the page to fetch. The script in this file passes
            the URL of a Douban tag listing page.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Fetch the URL that was passed in rather than a module-level ``url``
    # variable, so the function works no matter what globals exist.
    # The ``with`` block closes the response even if read/decode fails.
    with urllib.request.urlopen(movieTag) as tagUrl:
        return tagUrl.read().decode('utf-8')

def subject(tagUrl_read):
    """Extract movie entries from one listing page and add them to ``newlist``.

    Three independent regex scans collect (link, title) pairs, rating strings
    and vote-count strings; the results are zipped positionally into
    ``((link, title), rating, votes)`` tuples.

    Args:
        tagUrl_read: HTML text of a tag listing page.

    Returns:
        The module-level ``newlist`` after it has been extended in place with
        this page's entries.

    Known limitations (carried over from the original author's notes):
        1. Callers that sort per page do not get a ranking across all pages.
        2. Movie links are captured; posters are not.
        3. Results are appended to one shared list.
        4. Nothing is exported to a local txt/Excel file yet.
        5. Link, title, rating and votes are matched by separate patterns and
           paired by position, so an entry that lacks a rating or vote count
           shifts every later pairing on that page.
    """
    # Link and title. The trailing "/" of the subject URL is matched but left
    # out of the captured link; ``[^"]+`` stops the title at its closing quote.
    nameURL = re.findall(
        r'(https?://movie\.douban\.com/subject/[0-9]+)/"\s+title="([^"]+)"',
        tagUrl_read,
    )
    # Rating, kept as the string found in the page (e.g. "9.6").
    scoreURL = re.findall(
        r'<span\s+class="rating_nums">([0-9.]+)</span>',
        tagUrl_read,
    )
    # Vote count from "(N人评价)"; the Traditional spelling is accepted too.
    evaluateURL = re.findall(
        r'<span\s+class="pl">\((\w+)人(?:评价|評價)\)</span>',
        tagUrl_read,
    )
    movieLists = list(zip(nameURL, scoreURL, evaluateURL))
    # ``newlist`` is the module-level accumulator shared across pages.
    newlist.extend(movieLists)
    return newlist

# Percent-encode the tag with quote() so special (e.g. Chinese) characters
# are safe to embed in the URL.
movie_type = urllib.request.quote(input('請輸入電影類型(如劇情、喜劇、懸疑)：'))
page_end = int(input('請輸入搜索結束時的頁碼：'))
# The ``start`` query parameter is an item offset that advances by 20 per
# page (assumes the site lists 20 movies per page -- confirm).
num_end = page_end * 20
# Shared accumulator: subject() extends this list in place on every call.
newlist = []

for num in range(0, num_end, 20):
    # ``url`` is deliberately a module-level name: kept for compatibility
    # with helper code that may still read it as a global.
    url = r'http://movie.douban.com/tag/%s?start=%d' % (movie_type, num)
    movie_url = movie(url)
    subject_url = subject(movie_url)

# Rank all collected entries by rating, highest first. Each entry is
# ((link, title), rating, votes), so index 1 is the rating string; it is
# converted to float so that "10.0" sorts above "9.6" instead of being
# compared character by character. reverse=True means descending order
# (the default, reverse=False, is ascending).
movieLIST = sorted(newlist, key=lambda movieList: float(movieList[1]), reverse=True)
# The loop variable must not be called ``movie``: that would rebind the
# movie() function defined above.
for movie_entry in movieLIST:
    print(movie_entry)

# Pause before exiting (presumably to keep a console window open long enough
# to read the output).
time.sleep(3)