您好,登錄后才能下訂單哦!
小編給大家分享一下如何使用Python實現爬取亞馬遜數據并打印出Excel文件操作,相信大部分人都還不怎么了解,因此分享這篇文章給大家參考一下,希望大家閱讀完這篇文章后大有收獲,下面讓我們一起去了解一下吧!
具體如下:
#!/usr/bin/env python3
# encoding=UTF-8
"""Scrape amazon.cn search results per category and export them to an .xls file.

NOTE(review): the published copy of this script had pinyin annotations injected
into its Chinese string literals (for example a "(lèi)" after the character for
"category"). The literals below are restored to the simplified-Chinese text that
an amazon.cn page would contain -- confirm against a live page.
"""
import json
import math
import os
import re
import sys
import threading
import time
import urllib.parse
import urllib.request
import zlib
from html import unescape

import requests
import xlwt

# Kept from the original script. The original note described this as "one
# million"; the value actually passed is 1,000,000,000.
sys.setrecursionlimit(1000000000)

# Browser-like User-Agent sent with every request.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
# Seconds to wait for a response before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 30
# The original bucketed categories into 8 groups of 5, i.e. at most 40.
_MAX_CATEGORIES = 40
# Number of result slots (result_0 .. result_14) inspected on each page.
_RESULTS_PER_PAGE = 15
# Menu entries that are not real product categories.
_EXCLUDED_CATEGORIES = frozenset({'全部分类', 'Prime会员优先购'})

_CATEGORY_RE = re.compile('"category" : "(.*?)"', re.S)
_PAGE_CATEGORY_RE = re.compile(
    '<span class="a-color-state a-text-bold">(.*?)</span>', re.S)
_TITLE_RE = re.compile(
    '<a class="a-link-normal s-access-detail-page s-color-twister-title-link '
    'a-text-normal" target="_blank" title="(.*?)"', re.S)
_PRICE_RE = re.compile(
    '<span class="a-size-base a-color-price s-price a-text-bold">(.*?)</span>',
    re.S)


## Fetch all category names
def getProUrl():
    """Return every `"category" : "..."` value found on the landing page.

    The page is requested with POST, as in the original script. Only page
    index 0 is requested.
    """
    furl = ("https://www.amazon.cn/?tag=baidu250-23&hvadid={creative}"
            "&ref=pz_ic_22fvxh5dwf_e&page=")
    urlList = []
    with requests.Session() as session:
        for i in range(0, 1):
            html = session.post(furl + str(i), headers=_HEADERS,
                                timeout=_REQUEST_TIMEOUT)
            html.encoding = 'utf-8'
            urlList = _CATEGORY_RE.findall(html.text)
    return urlList


## Build the search URL for one category
def getUrlData(ci):
    """Return the review-rank-sorted search URL for the keyword *ci*.

    The keyword is percent-encoded so that characters such as ``&`` or spaces
    in a category name cannot break the query string.
    """
    url = ("https://www.amazon.cn/s/ref=nb_sb_noss_2?__mk_zh_CN="
           "%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99"
           "&url=search-alias%3Daps&field-keywords="
           + urllib.parse.quote(ci, safe="")
           + "&page=1&sort=review-rank")
    return url


## Throttle: pause between requests
def fun_timer():
    """Sleep for three seconds."""
    time.sleep(3)


## Download the search page of every category
def getProData(allUrlList):
    """Return the HTML text of the search page of each keyword in *allUrlList*.

    A fresh session is used for every keyword (as in the original) and is
    closed once its response has been read. Each page is echoed to stdout.
    """
    webContentHtmlList = []
    for ci in allUrlList:
        fun_timer()
        with requests.Session() as session:
            html = session.get(getUrlData(ci), headers=_HEADERS,
                               timeout=_REQUEST_TIMEOUT)
        # Force the decoding used for .text
        html.encoding = 'utf-8'
        # The original computed this gb2312-safe copy but threw it away and
        # printed the raw text; print the sanitised copy instead.
        print(html.text.encode('gb2312', 'ignore').decode('gb2312'))
        webContentHtmlList.append(html.text)
    return webContentHtmlList


## Extract the wanted fields from every page and export them
def getProValue():
    """Scrape up to 40 categories and write the collected rows to an .xls file.

    Each row is [category, title, price, seller count, star rating, review
    count]. The output file name is today's date (YYYYMMDD) followed by
    '亚马逊销量数据统计.xls'.
    """
    # Filtering (rather than list.remove) does not raise when a label is
    # absent from the page.
    categories = [name for name in getProUrl()
                  if name not in _EXCLUDED_CATEGORIES]
    # The original split the categories into eight 5-element buckets with
    # consecutive `if` statements, which put every 5th category in two buckets
    # and skipped ahead; all buckets were then processed in order into a single
    # list, so one ordered slice is the intended behaviour.
    webContentHtmlList = getProData(categories[:_MAX_CATEGORIES])

    ## Accumulates every extracted row
    dataTwoAllList1 = []
    print("开始检索数据,检索数据中..........")
    for html in webContentHtmlList:
        for i in range(_RESULTS_PER_PAGE):
            dataList = [
                unescape(getProCategory(html, i)),
                unescape(getProTitle(html, i)),
                getProPrice(html, i),
                getSellerCount(html, i),
                getProStar(html, i),
                getProCommentCount(html, i),
            ]
            print(dataList)
            dataTwoAllList1.append(dataList)
    print("检索数据完成!!!!")
    print("开始保存并打印Excel文档数据!!!!")
    ## Save the workbook
    createTable(time.strftime("%Y%m%d") + '亚马逊销量数据统计.xls',
                dataTwoAllList1)


## Extract the category
def getProCategory(html, i):
    """Return the first bold "a-color-state" span of the page, or "".

    *i* is accepted for signature parity with the other extractors but is
    ignored: the original reset it to 0, so every row of a page gets the
    same value.
    """
    items = _PAGE_CATEGORY_RE.findall(html)
    return items[0] if items else ""


## Extract the title
def getProTitle(html, i):
    """Return the title attribute of result *i*'s detail link, or ""."""
    items = _TITLE_RE.findall(getHtmlById(html, i))
    return items[0] if items else ""


## Extract the price
def getProPrice(html, i):
    """Return the price text of result *i*, or "¥0" when none is found."""
    items = _PRICE_RE.findall(getHtmlById(html, i))
    return items[0] if items else "¥0"
# NOTE(review): the published copy of this script had pinyin annotations
# injected into its Chinese literals; they are restored here to the
# simplified-Chinese text -- confirm against a live page.
_NO_SELLERS = '(0 卖家)'
_NO_STARS = '平均 0 星'
# Longest seller-count text accepted; anything longer is treated as unrelated.
_MAX_SELLER_TEXT_LEN = 9

_SECONDARY_SPAN_RE = re.compile(
    '<span class="a-color-secondary">(.*?)</span>', re.S)
_ICON_ALT_RE = re.compile('<span class="a-icon-alt">(.*?)</span>', re.S)
# The published pattern required a literal ` rel="external nofollow" ` after
# the href, which appears to have been injected by the hosting site. Any
# trailing attributes are accepted instead, so both forms match.
_REVIEW_COUNT_RE = re.compile(
    '<a class="a-size-small a-link-normal a-text-normal" target="_blank" '
    'href="[^"]*#customerReviews"[^>]*>(.*?)</a>', re.S)


## Extract the seller count
def getSellerCount(html, i):
    """Return the seller-count text of result *i*, or '(0 卖家)' if absent."""
    items = _SECONDARY_SPAN_RE.findall(getHtmlById(html, i))
    if not items:
        return _NO_SELLERS
    return checkSellerCount(items, 0)


## Validate the seller count
def checkSellerCount(items, i):
    """Return the first entry of ``items[i:]`` that mentions '卖家'.

    The first matching entry decides the result: it is returned when it is at
    most nine characters long, otherwise '(0 卖家)' is returned. '(0 卖家)' is
    also returned when no entry matches or *items* is empty.

    The original inspected only two entries (its third check was unreachable
    and compared against ``len(items[i])``) and raised IndexError on an empty
    list.
    """
    for text in items[i:]:
        if '卖家' in text:
            return text if len(text) <= _MAX_SELLER_TEXT_LEN else _NO_SELLERS
    return _NO_SELLERS


## Extract the star rating  <span class="a-icon-alt">
def getProStar(html, i):
    """Return the star-rating text of result *i*, or '平均 0 星' if absent."""
    items = _ICON_ALT_RE.findall(getHtmlById(html, i))
    if not items:
        return _NO_STARS
    return checkProStar(items, 0)


## Validate the star rating
def checkProStar(items, i):
    """Return the first entry of ``items[i:]`` containing '星'.

    Falls back to '平均 0 星' when nothing matches or *items* is empty. The
    original looked at two entries only.
    """
    for text in items[i:]:
        if '星' in text:
            return text
    return _NO_STARS


## Extract the review count (used as a sales proxy)
def getProCommentCount(html, i):
    """Return the text of the *i*-th customer-reviews link in *html*, or "0".

    NOTE(review): unlike the other extractors this indexes links across the
    whole page rather than inside result *i*; a result without a reviews link
    would shift every later count -- confirm whether that is intended.
    """
    items = _REVIEW_COUNT_RE.findall(html)
    if i < len(items):
        return items[i].strip()
    return "0"
## Find the opening tag that carries a given id
def get_id_tag(content, id_name):
    """Return the first tag in *content* whose id attribute equals *id_name*.

    The id may be quoted or unquoted and is matched case-insensitively. It is
    escaped before being placed in the pattern, so regex metacharacters in
    *id_name* are matched literally. Returns "" when no tag matches.
    """
    id_name = id_name.strip()
    patt_id_tag = (r"""<[^>]*id=['"]?""" + re.escape(id_name)
                   + r"""['" ][^>]*>""")
    found = re.search(patt_id_tag, content, re.DOTALL | re.IGNORECASE)
    return found.group(0) if found else ""


## Narrow the search to a single result
def getHtmlById(html, i):
    """Return the slice of *html* from tag ``result_<i>`` through ``result_<i+1>``.

    Both delimiting tags are included. The tags are located as plain text, so
    characters such as ``(`` or ``[`` inside them cannot corrupt a regular
    expression (the original used the raw tags as a pattern).

    Returns "" when the ``result_<i>`` tag is missing (the original returned
    the page prefix in that case) or when the next tag does not occur after
    it. When there is no ``result_<i+1>`` tag at all, only the opening
    ``result_<i>`` tag is returned, as before.
    """
    start = get_id_tag(html, "result_" + str(i))
    if not start:
        return ""
    end = get_id_tag(html, "result_" + str(i + 1))
    html = html.strip()
    begin = html.find(start)
    if begin < 0:
        return ""
    if not end:
        return start
    stop = html.find(end, begin + len(start))
    if stop < 0:
        return ""
    return html[begin:stop + len(end)]


## Write the Excel workbook
def createTable(tableName, dataTwoAllList):
    """Write *dataTwoAllList* to the .xls file *tableName*.

    Row 0 holds the six bold column titles; every record follows on its own
    row. The original iterated ``range(1, rows)`` and therefore never wrote
    the last record.
    """
    # NOTE(review): header text restored from a copy that had pinyin
    # annotations injected into it -- confirm the wording.
    columnName = "类别,标题,价格,卖家统计,星级,评论数".split(',')
    # Workbook encoded as utf-8 so that Chinese text is supported
    wb = xlwt.Workbook(encoding='utf-8')
    sheet = wb.add_sheet('sheet 1')

    # Horizontally centred alignment shared by both styles
    alignment = xlwt.Alignment()
    alignment.horz = xlwt.Alignment.HORZ_CENTER

    # Style of the data cells
    font = xlwt.Font()
    font.name = 'Times New Roman'
    style = xlwt.XFStyle()
    style.font = font
    style.alignment = alignment

    # Style of the header cells: same font, bold
    font1 = xlwt.Font()
    font1.name = 'Times New Roman'
    font1.bold = True
    style1 = xlwt.XFStyle()
    style1.font = font1
    style1.alignment = alignment

    for col, title in enumerate(columnName):
        sheet.col(col).width = 5000
        sheet.write(0, col, title, style1)

    for row, record in enumerate(dataTwoAllList, start=1):
        for col in range(len(columnName)):
            sheet.write(row, col, record[col], style)
    wb.save(tableName)


## Script entry point
def _main():
    """Prompt the user, run the export and report completion."""
    # NOTE(review): the messages are restored from a copy whose Chinese text
    # had pinyin annotations injected into it -- confirm the wording.
    input("按回车键开始导出..........")
    # Announce the wait before sleeping (the original slept first).
    print("三秒后开始抓取数据.......,请等待!")
    fun_timer()
    getProValue()
    print("数据导出成功!请注意查看!")
    # Backslashes are doubled: "\W" and "\S" are invalid escape sequences.
    # NOTE(review): the workbook is saved under a relative name, so it lands
    # in the working directory; the path in this message may not apply.
    print("数据文档《亚马逊销量数据统计.xls》已经存于C盘下面的"
          "C:\\Windows\\SysWOW64的该路径下面!!!!")
    input()


# Guarded so that importing the module does not start a scrape.
if __name__ == "__main__":
    _main()
結果數據:
打包成exe文件,直接可以點擊運行:打包過程我就不一一說了,都是一些命令操作:
要安裝pyinstaller,打成exe的操作命令:--icon是圖標,路徑和項目當前路徑一樣
途中遇到很多問題,都一一解決了,亂碼,ip限制,打包后引入模塊找不到,遞歸最大次數,過濾的一些問題
pyinstaller -F -c --icon=my.ico crawling.py
這是打包命令
效果圖:
以上是“如何使用Python實現爬取亞馬遜數據并打印出Excel文件操作”這篇文章的所有內容,感謝各位的閱讀!相信大家都有了一定的了解,希望分享的內容對大家有所幫助,如果還想學習更多知識,歡迎關注億速云行業資訊頻道!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。