您好,登錄后才能下訂單哦!
本篇內容主要講解“Python怎么實現爬取騰訊招聘網崗位信息”,感興趣的朋友不妨來看看。本文介紹的方法操作簡單快捷,實用性強。下面就讓小編來帶大家學習“Python怎么實現爬取騰訊招聘網崗位信息”吧!
開發環境
Windows 10
python3.6
開發工具
pycharm
庫
numpy、matplotlib、time、xlutils.copy、os、xlwt, xlrd, random
代碼運行展示
1.打開騰訊招聘的網址右擊檢查進行抓包,進入網址的時候發現有異步渲染,我們要的數據為異步加載
2.構造起始地址:
start_url = 'https://careers.tencent.com/tencentcareer/api/post/Query'
參數在headers的最下面
timestamp: 1625641250509
countryId:
cityId:
bgIds:
productId:
categoryId:
parentCategoryId:
attrId:
keyword:
pageIndex: 1
pageSize: 10
language: zh-cn
area: cn
3.發送請求,獲取響應
# Endpoint of the job-posting query API.
self.start_url = 'https://careers.tencent.com/tencentcareer/api/post/Query'

# Query-string parameters; every filter field is sent empty.
params = dict(
    timestamp=str(int(time.time() * 1000)),  # current time in milliseconds
    countryId='',
    cityId='',
    bgIds='',
    productId='',
    categoryId='',
    parentCategoryId='',
    attrId='',
    keyword='',
    pageIndex=str(self.start_page),
    pageSize='10',
    language='zh-cn',
    area='cn',
)

# Send a randomly chosen User-Agent with the request.
headers = {'user-agent': random.choice(USER_AGENT_LIST)}

# Issue the GET request and decode the JSON body.
reply = session.get(url=self.start_url, headers=headers, params=params)
response = reply.json()
4.提取數據,獲取崗位信息大列表,提取相應的數據
# Pull the list of job postings out of the decoded response.
json_data = response['Data']['Posts']

if json_data is not None:
    # Each entry is a dict describing one posting; read its fields by key.
    for data in json_data:
        # Work location, also collected into the location list.
        LocationName = data['LocationName']
        self.addr_list.append(LocationName)
        # Job category, also collected into the category list.
        CategoryName = data['CategoryName']
        self.category_list.append(CategoryName)
        # Posting title, responsibilities, last update time and detail URL.
        RecruitPostName = data['RecruitPostName']
        Responsibility = data['Responsibility']
        LastUpdateTime = data['LastUpdateTime']
        PostURL = data['PostURL']
else:
    # No postings in the response: clear the flag that drives the paging loop.
    self.is_running = False
5.數據生成折線圖、餅圖、散點圖、柱狀圖
# Chart 1: line chart of per-location counts against per-category counts.
# These two rcParams settings let matplotlib render Chinese text and the
# minus sign.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# The two dicts can hold different numbers of entries. Keep at most five
# values from each, then trim both to a common length: plt.plot raises
# ValueError when x and y differ in length.
x_axis_data = list(addr_dict.values())[:5]
y_axis_data = list(cate_dict.values())[:5]
point_count = min(len(x_axis_data), len(y_axis_data))
x_axis_data = x_axis_data[:point_count]
y_axis_data = y_axis_data[:point_count]

# Location counts go on the x axis and category counts on the y axis so the
# data matches the axis labels set below (the arguments used to be swapped).
# The format string carries only marker and line style; the colour comes from
# the `color` keyword, so the two no longer conflict.
plt.plot(x_axis_data, y_axis_data, 'o-', color='#4169E1', alpha=0.8, linewidth=1, label='數(shù)量')

# Without an explicit legend() call the `label` passed to plot() is not shown.
plt.legend(loc="upper right")
plt.xlabel('地點(diǎn)數(shù)量')
plt.ylabel('工作屬性數(shù)量')
plt.savefig('根據(jù)崗位地址和崗位屬性二者數(shù)量生成折線圖.png')
plt.show()
# Chart 2: pie chart of the number of postings per work location.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# Slice labels are the location names; slice sizes are their counts.
addr_dict_key = list(addr_dict.keys())
addr_dict_value = list(addr_dict.values())

plt.pie(addr_dict_value, labels=addr_dict_key, autopct='%1.1f%%')
plt.title('崗位地址和崗位屬性百分比分布')
plt.savefig('崗位地址和崗位屬性百分比分布-餅圖')
plt.show()
# Chart 3: scatter plot of every name in the combined dict against its count.
# The rcParams settings let matplotlib render Chinese text and the minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Names go on the y axis, counts on the x axis.
production = list(data.keys())
tem = list(data.values())

# One random value per point, used as its colour.
colors = np.random.rand(len(tem))

plt.scatter(tem, production, s=200, c=colors)  # marker size 200
plt.xlabel('數(shù)量')  # x-axis title
plt.ylabel('名稱')  # y-axis title
plt.savefig('崗位地址和崗位屬性散點(diǎn)圖')
plt.show()
# Chart 4: bar chart of every name in the combined dict with its count.
import matplotlib

# NOTE(review): the original forces the TkAgg backend only for this chart;
# kept as-is -- confirm whether it is still needed.
matplotlib.use('TkAgg')

# These two rcParams settings let matplotlib render Chinese text and the
# minus sign.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

name_list = list(data.keys())
num_list = list(data.values())
width = 0.5  # bar width
index = np.arange(len(name_list))
plt.bar(index, num_list, width, color='steelblue', tick_label=name_list, label='崗位數(shù)量')

# The earlier plt.legend([...], prop=zhfont1) call was removed. Its labels
# were unrelated to this chart, the plt.legend() call below replaced it
# anyway, and its font path literal 'C:\Windows\Fonts\simsun.ttc' relied on
# invalid escape sequences (\W, \F, \s).

# Print each bar's value just above the bar.
for a, b in zip(index, num_list):
    plt.text(a, b, '%.2f' % b, ha='center', va='bottom', fontsize=7)

plt.xticks(rotation=270)
plt.title('崗位數(shù)量和崗位屬性數(shù)量柱狀圖')
plt.ylabel('次')
plt.legend()
plt.savefig('崗位數(shù)量和崗位屬性數(shù)量柱狀圖-柱狀圖', bbox_inches='tight')
plt.show()
# Crawler for the Tencent careers job-posting API: downloads postings page by
# page, appends each posting to an .xls workbook and finally draws four
# summary charts.
import os
import random
import time
from collections import Counter

import numpy as np
import xlrd
import xlwt
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties  # font library
from requests_html import HTMLSession
from xlutils.copy import copy

# The original list contained this group of 17 User-Agent strings twice in a
# row. The repetition is kept (see USER_AGENT_LIST) so random.choice() picks
# each string with the same probability as before.
_REPEATED_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3451.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.2999.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.70 Safari/537.36',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36 OPR/31.0.1889.174',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; MS-RTC LM 8; InfoPath.2; Tablet PC 2.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 OPR/55.0.2994.61',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.814.0 Safari/535.1',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; ja-jp) AppleWebKit/418.9.1 (KHTML, like Gecko) Safari/419.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0; Touch; MASMJS)',
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1041.0 Safari/535.21',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
]

# Pool of User-Agent strings; one is chosen at random for every request.
USER_AGENT_LIST = _REPEATED_USER_AGENTS * 2 + [
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4093.3 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko; compatible; Swurl) Chrome/77.0.3865.120 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4086.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/91.0.146 Chrome/85.0.4183.146 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 VivoBrowser/8.4.72.0 Chrome/62.0.3202.84',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:83.0) Gecko/20100101 Firefox/83.0',
    'Mozilla/5.0 (X11; CrOS x86_64 13505.63.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36 OPR/72.0.3815.400',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
]

# One HTTP session shared by all requests.
session = HTMLSession()


class TXSpider(object):
    """Download job postings, store them in an .xls workbook and chart them."""

    # Name of the worksheet that receives the postings; save_excel() matches
    # the keys of its argument against worksheet names, so the same value is
    # used as the key of every row dict.
    SHEET_NAME = '崗位詳情'
    # Paging stops once the page counter reaches this value, i.e. pages
    # 1 .. MAX_PAGE - 1 are requested.
    MAX_PAGE = 20
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # URL of the query API.
        self.start_url = 'https://careers.tencent.com/tencentcareer/api/post/Query'
        # Number of the next page to request.
        self.start_page = 1
        # Paging continues while this flag is True.
        self.is_running = True
        # Work location of every posting seen so far.
        self.addr_list = []
        # Job category of every posting seen so far.
        self.category_list = []

    def parse_start_url(self):
        """Request result pages until paging stops, then draw the charts.

        Paging ends when a page contains no postings or when the page counter
        reaches ``MAX_PAGE``.

        :return: None
        """
        while self.is_running:
            params = {
                # Current time in milliseconds.
                'timestamp': str(int(time.time() * 1000)),
                'countryId': '',
                'cityId': '',
                'bgIds': '',
                'productId': '',
                'categoryId': '',
                'parentCategoryId': '',
                'attrId': '',
                'keyword': '',
                'pageIndex': str(self.start_page),
                'pageSize': '10',
                'language': 'zh-cn',
                'area': 'cn',
            }
            headers = {'user-agent': random.choice(USER_AGENT_LIST)}
            # The timeout keeps a stalled connection from blocking forever.
            response = session.get(
                url=self.start_url,
                headers=headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            ).json()
            self.parse_response_json(response)
            self.start_page += 1
            if self.start_page >= self.MAX_PAGE:
                self.is_running = False
        # Paging finished: build the charts from everything collected.
        self.crate_img_four_func()

    def crate_img_four_func(self):
        """Draw, save and show the four summary charts.

        Does nothing when no postings were collected.

        :return: None
        """
        if not self.addr_list:
            return
        addr_dict, cate_dict, data = self._count_postings()
        self._draw_line_chart(addr_dict, cate_dict)
        self._draw_pie_chart(addr_dict)
        self._draw_scatter_chart(data)
        self._draw_bar_chart(data)

    def _count_postings(self):
        """Count postings per location, per category, and per name overall.

        :return: tuple ``(addr_dict, cate_dict, data)`` of Counters; ``data``
            holds locations and categories together, in first-seen order.
        """
        addr_dict = Counter()
        cate_dict = Counter()
        data = Counter()
        for location, category in zip(self.addr_list, self.category_list):
            # Each dict is incremented from its own previous value. The
            # original derived the category count from the combined dict,
            # which over-counted categories and could raise KeyError when a
            # location and a category shared a name.
            addr_dict[location] += 1
            data[location] += 1
            cate_dict[category] += 1
            data[category] += 1
        return addr_dict, cate_dict, data

    @staticmethod
    def _use_chinese_font(font_name):
        """Select a font that can render Chinese and keep the minus sign."""
        plt.rcParams['font.sans-serif'] = [font_name]
        plt.rcParams['axes.unicode_minus'] = False

    def _draw_line_chart(self, addr_dict, cate_dict):
        """Chart 1: line chart of location counts against category counts."""
        self._use_chinese_font('SimHei')
        # Keep at most five values from each dict and trim both to a common
        # length; plt.plot raises ValueError when x and y differ in length.
        x_axis_data = list(addr_dict.values())[:5]
        y_axis_data = list(cate_dict.values())[:5]
        point_count = min(len(x_axis_data), len(y_axis_data))
        # Location counts go on the x axis so the data matches the axis
        # labels below. The format string only sets marker and line style;
        # the colour comes from the `color` keyword.
        plt.plot(x_axis_data[:point_count], y_axis_data[:point_count], 'o-',
                 color='#4169E1', alpha=0.8, linewidth=1, label='數(shù)量')
        # Without legend() the label passed to plot() is not displayed.
        plt.legend(loc="upper right")
        plt.xlabel('地點(diǎn)數(shù)量')
        plt.ylabel('工作屬性數(shù)量')
        plt.savefig('根據(jù)崗位地址和崗位屬性二者數(shù)量生成折線圖.png')
        plt.show()

    def _draw_pie_chart(self, addr_dict):
        """Chart 2: pie chart of postings per work location."""
        self._use_chinese_font('Microsoft YaHei')
        plt.pie(list(addr_dict.values()), labels=list(addr_dict), autopct='%1.1f%%')
        plt.title('崗位地址和崗位屬性百分比分布')
        plt.savefig('崗位地址和崗位屬性百分比分布-餅圖')
        plt.show()

    def _draw_scatter_chart(self, data):
        """Chart 3: scatter plot of every name against its count."""
        self._use_chinese_font('SimHei')
        production = list(data)
        tem = list(data.values())
        colors = np.random.rand(len(tem))  # one random colour value per point
        plt.scatter(tem, production, s=200, c=colors)  # marker size 200
        plt.xlabel('數(shù)量')
        plt.ylabel('名稱')
        plt.savefig('崗位地址和崗位屬性散點(diǎn)圖')
        plt.show()

    def _draw_bar_chart(self, data):
        """Chart 4: bar chart of every name with its count."""
        import matplotlib

        # NOTE(review): the original forces the TkAgg backend only for this
        # chart; kept as-is -- confirm whether it is still needed.
        matplotlib.use('TkAgg')
        self._use_chinese_font('SimHei')
        name_list = list(data)
        num_list = list(data.values())
        width = 0.5  # bar width
        index = np.arange(len(name_list))
        plt.bar(index, num_list, width, color='steelblue', tick_label=name_list, label='崗位數(shù)量')
        # The earlier legend call with hard-coded labels and a Windows font
        # path was removed: the legend() call below replaced it anyway and
        # the path literal relied on invalid escape sequences.
        for a, b in zip(index, num_list):
            # Print each bar's value just above the bar.
            plt.text(a, b, '%.2f' % b, ha='center', va='bottom', fontsize=7)
        plt.xticks(rotation=270)
        plt.title('崗位數(shù)量和崗位屬性數(shù)量柱狀圖')
        plt.ylabel('次')
        plt.legend()
        plt.savefig('崗位數(shù)量和崗位屬性數(shù)量柱狀圖-柱狀圖', bbox_inches='tight')
        plt.show()

    def parse_response_json(self, response):
        """Extract the postings of one result page and save each of them.

        Clears ``is_running`` when the page carries no postings.

        :param response: decoded JSON body of one API reply
        :return: None
        """
        # 'Data' may be missing or null, so fall back to an empty mapping
        # instead of indexing into None.
        posts = (response.get('Data') or {}).get('Posts')
        if not posts:
            self.is_running = False
            return
        for post in posts:
            location = post['LocationName']
            self.addr_list.append(location)
            category = post['CategoryName']
            self.category_list.append(category)
            title = post['RecruitPostName']
            responsibility = post['Responsibility']
            updated = post['LastUpdateTime']
            url = post['PostURL']
            # The key names the worksheet; the list is one row in column order.
            data_dict = {
                self.SHEET_NAME: [title, location, category, responsibility, updated, url]
            }
            self.save_excel(data_dict)
            print(f"第{self.start_page}頁--崗位{title}----采集完成----logging?。?!")

    def save_excel(self, data_dict):
        """Append one row per matching worksheet to the output workbook.

        The output directory and the workbook (with its header row) are
        created on first use.

        :param data_dict: mapping of worksheet name to the list of cell
            values that form the new row
        :return: None
        """
        os_path_1 = os.getcwd() + '/數(shù)據(jù)/'
        os.makedirs(os_path_1, exist_ok=True)
        os_path = os_path_1 + '騰訊招聘數(shù)據(jù).xls'

        if not os.path.exists(os_path):
            # First call: create the workbook and write the header row.
            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet1 = workbook.add_sheet(self.SHEET_NAME, cell_overwrite_ok=True)
            excel_data_1 = ('崗位名稱', '工作地點(diǎn)', '工作屬性', '崗位職責(zé)', '發(fā)布時(shí)間', '崗位地址')
            for column, header in enumerate(excel_data_1):
                worksheet1.col(column).width = 2560 * 3
                worksheet1.write(0, column, header)
            workbook.save(os_path)

        # Read the existing workbook to learn the row counts, then write into
        # one writable copy so rows for several sheets do not overwrite each
        # other.
        workbook = xlrd.open_workbook(os_path)
        new_workbook = copy(workbook)
        changed = False
        for sheet_index, sheet_name in enumerate(workbook.sheet_names()):
            if sheet_name not in data_dict:
                continue
            # The new row goes directly below the rows already present.
            rows_old = workbook.sheet_by_index(sheet_index).nrows
            new_worksheet = new_workbook.get_sheet(sheet_index)
            for column, value in enumerate(data_dict[sheet_name]):
                new_worksheet.write(rows_old, column, value)
            changed = True
        if changed:
            new_workbook.save(os_path)

    def run(self):
        """Start the crawl.

        :return: None
        """
        self.parse_start_url()


if __name__ == '__main__':
    # Create the spider and start it.
    t = TXSpider()
    t.run()
到此,相信大家對“Python怎么實現爬取騰訊招聘網崗位信息”有了更深的了解,不妨來實際操作一番吧!這里是億速云網站,更多相關內容可以進入相關頻道進行查詢,關注我們,繼續學習!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。