本文主要分享關於 Python 登錄並爬取淘寶信息的相關代碼,還是挺不錯的,大家可以了解一下。
#!/usr/bin/env python # -*- coding:utf-8 -*- from selenium import webdriver import time import datetime import traceback import logging import os from selenium.webdriver.common.action_chains import ActionChains import codecs #登錄 def login(driver,site): driver.get(site) time.sleep(5) try: #點擊請登錄 driver.find_element_by_class_name("h").click() time.sleep(5) #輸入賬號和密碼 driver.find_element_by_id("TPL_username_1").send_keys(u"yourusername") time.sleep(5) #print driver.find_element_by_id("TPL_username_1") driver.find_element_by_id("TPL_password_1").send_keys(u"yourpsd") time.sleep(5) #點擊登錄 driver.find_element_by_id("J_SubmitStatic").click() time.sleep(30) except: print u"failure" def crawlmarket(driver,filename,site): #driver = webdriver.Firefox() driver.get(site) driver.maximize_window() time.sleep(10) driver.refresh() time.sleep(10) test = driver.find_elements_by_xpath("http://a[@class='J_ItemLink']") #是否獲取到消息,若無則登錄 if len(test)==0: login(driver,site) time.sleep(30) resultstrall="" resultstr="" strinfo ="" for i in range(0,len(test),1): if test[i].text != "" : resultstr = test[i].text.strip()+'/n' print resultstr resultstrall += resultstr #是否成功抓取 if resultstrall !="": f = codecs.open(filename,'w','utf-8') f.write(resultstrall) f.close() #若沒有成功抓取將網站寫入error else: strinfo = filename+","+site print strinfo ferror = codecs.open("error.txt",'a','utf-8') ferror.write(strinfo) ferror.close() driver.quit() def crawltaobaosousuo(driver,filename,site): #driver = webdriver.Firefox() driver.get(site) driver.maximize_window() time.sleep(10) driver.get(site) time.sleep(30) driver.refresh() test = driver.find_elements_by_xpath("http://a[@class='J_ClickStat']") resultstrall="" resultstr="" strinfo ="" for i in range(0,len(test),1): if test[i].text != "" : resultstr = test[i].text.strip()+'/n' print resultstr resultstrall += resultstr if resultstrall !="": f = codecs.open(filename,'w','utf-8') f.write(resultstrall) f.close() else: strinfo = filename+","+site print strinfo ferror = 
codecs.open("error.txt",'a','utf-8') ferror.write(strinfo) ferror.close() driver.quit() def jiexi(driver): f = open("1.txt","r") for line in f: time.sleep(60) info = line.split(",") href = info[1] filename = info[0].decode("utf-8") print filename if "markets" in href: crawlmarket(driver,filename,href) else: crawltaobaosousuo(driver,filename,href) if __name__ =='__main__': driver = webdriver.Firefox() jiexi(driver) 小結
如果有改進策略,歡迎一起探討。這段代碼可以抓取淘寶部分網頁內容,大家可以根據自己的需求修改;需要注意的是,這樣抓取會被風控,個人覺得不登錄的效果更好。
新聞熱點
疑難解答