記錄抓取某直聘網站

發布時間：2020-10-10 11:41:05 來源：網絡 閱讀：157 作者：Gendan5 欄目：編程語言

近期有朋友讓我幫抓一下某個直聘網站的招聘崗位，閑來無事就試了一下。

考慮到這種網站肯定是有反爬機制，于是使用Selenium+Chrome的方式抓取

用到的主要工具：

python3.5

selenium

scrapy

由于[網站的數據跟單](http://www.gendan5.com/tech.html)是可以按照地市來查詢的，所以先訪問該網站支持的城市劃分

使用scrapy的self.start_urls進行請求

self.start_urls = ['https://www.zhipin.com/wapi/zpCommon/data/city.json',]

同時使用selenium請求該網站主頁

self.driver.get('https://www.zhipin.com/')

后來發現網站可以識別selenium，不返回數據，于是添加

# Build the Chrome options first, then start the browser with them.
chrome_options = webdriver.ChromeOptions()
# Switches Chrome should NOT be started with; the article adds this because
# the site recognised Selenium and returned no data without it.
excluded_switches = ['enable-automation']
chrome_options.add_experimental_option('excludeSwitches', excluded_switches)
self.driver = webdriver.Chrome(options=chrome_options)

將程序設置為開發者模式，數據可以正常請求到

接下來就是解析支持搜索的城市名，并且匯總成我們能使用的數據格式

    # Build ``dic``: province name -> list of city names to search for.
    json_text = json.loads(response.text)['zpData']['cityList']
    dic = {}
    for entry in json_text:
        # The province name is the dictionary key.
        province = entry['name']
        # Treat a null sub-level list like an empty one instead of crashing.
        provinces = entry['subLevelModelList'] or []
        if len(provinces) > 1:
            # Several sub-levels: each one is a searchable city.
            dic[province] = [sub['name'] for sub in provinces]
        else:
            # At most one sub-level (presumably a directly-administered
            # municipality): the entry itself is the only city.
            dic[province] = [province]

準備工作完成了，接下來就是請求數據了

    # Home-page search box; this first search is only a transition step to
    # reach the results page. ``find_element('xpath', ...)`` is used because
    # ``find_element_by_xpath`` no longer exists in Selenium >= 4.3, while
    # this form works on both Selenium 3 and 4.
    self.driver.find_element('xpath', '//*[@id="wrap"]/div[3]/div/div/div[1]/form/div[2]/p/input').send_keys('需要查詢的崗位')
    sleep(2)  # give the page time to react before clicking
    self.driver.find_element('xpath', '//*[@id="wrap"]/div[3]/div/div/div[1]/form/button').click()
    sleep(2)  # wait for the results page to load

到這里，程序算是進入了正軌，直接貼上代碼。如下：

# -*- coding: utf-8 -*-

import scrapy

import json

import re

from scrapy.spiders import CrawlSpider

from time import sleep

from ..items import ZhaopinBossZhipinItem

from scrapy.selector import Selector

import importlib

import random

from selenium import webdriver

import sys

importlib.reload(sys)

class ZP_boss(CrawlSpider):
    """Scrape zhipin.com job postings city by city through a Selenium-driven Chrome.

    Scrapy only downloads the JSON list of searchable cities (``start_urls``).
    :meth:`parse` then performs every search, pagination step and detail-page
    visit in the browser held in ``self.driver`` and prints the extracted
    fields.

    NOTE(review): nothing is yielded to Scrapy, so the configured item
    pipeline never receives data - confirm whether items should be emitted.
    NOTE(review): Scrapy's documentation advises against overriding ``parse``
    on a ``CrawlSpider``; no ``rules`` are defined here, so this is presumably
    harmless, but deriving from ``scrapy.Spider`` would be safer - confirm.
    NOTE(review): several printed labels look garbled by a character
    conversion; they are kept byte-for-byte because they are runtime output.
    """

    # boss > dental job postings by region
    name = "boss"

    custom_settings = {
        'ITEM_PIPELINES': {'zhaopin_bosszhipin.pipelines.ZhaopinBossPipeline': 300, },
        # NOTE(review): this key sits at the top level of the settings dict.
        # Scrapy reads middleware priorities from DOWNLOADER_MIDDLEWARES, so
        # this entry presumably has no effect - confirm intent before moving it.
        'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 1,
        'DOWNLOAD_DELAY': 0.5,
        'MYEXT_ENABLED': True
    }

    # A full result page holds this many <li> entries; fewer means last page.
    _PAGE_SIZE = 30
    # Search box and button on the results page.
    _QUERY_INPUT_XPATH = '//p[@class="ipt-wrap"]/input[@name="query"]'
    _SEARCH_BUTTON_XPATH = '//button[@class="btn btn-search"]'
    _NEXT_PAGE_XPATH = '//a[@ka="page-next"]'
    # First word followed by whitespace: the city name of a listing entry.
    _CITY_RE = re.compile(r'(\w+?)\s')

    def __init__(self, *args, **kwargs):
        """Configure the spider and open the Chrome window used for scraping.

        Positional and keyword arguments are forwarded to the Scrapy base
        class so that spider arguments keep working; calling the constructor
        without arguments behaves as before.
        """
        super(ZP_boss, self).__init__(*args, **kwargs)
        # allowed_domains takes bare domain names, not URLs.
        self.allowed_domains = ["zhipin.com"]
        # JSON endpoint listing the cities the site supports for searching.
        self.start_urls = ['https://www.zhipin.com/wapi/zpCommon/data/city.json',]
        options = webdriver.ChromeOptions()
        # Start Chrome without the "enable-automation" switch.
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.driver = webdriver.Chrome(options=options)
        self.driver.maximize_window()  # maximise the browser window
        self.driver.get('https://www.zhipin.com/')

    def parse(self, response):
        """Read the city list from *response* and scrape every city in the browser.

        The browser is always closed when this method finishes, including
        when scraping fails part-way through.
        """
        try:
            city_list = json.loads(response.text)['zpData']['cityList']
            city_map = self._build_city_map(city_list)

            # Home-page search box; this first search is only a transition
            # step to reach the results page.
            self._find('//*[@id="wrap"]/div[3]/div/div/div[1]/form/div[2]/p/input').send_keys('python')
            sleep(2)
            self._find('//*[@id="wrap"]/div[3]/div/div/div[1]/form/button').click()
            sleep(2)

            for cities in city_map.values():  # all cities of one province
                for ct in cities:  # a single city name
                    self._scrape_city(ct)
                    sleep(random.randint(5,15))
        finally:
            # Scraping is over (or failed): shut down the browser process.
            self.driver.quit()

    @staticmethod
    def _build_city_map(city_list):
        """Map each province name in *city_list* to its searchable city names."""
        city_map = {}
        for entry in city_list:
            province = entry['name']
            # Treat a null sub-level list like an empty one.
            sub_levels = entry['subLevelModelList'] or []
            if len(sub_levels) > 1:
                city_map[province] = [sub['name'] for sub in sub_levels]
            else:
                # At most one sub-level (presumably a directly-administered
                # municipality): the entry itself is the only city.
                city_map[province] = [province]
        return city_map

    def _find(self, xpath):
        """Return the first element matching *xpath* in the current window.

        Uses ``find_element('xpath', ...)``, which exists in both Selenium 3
        and 4; ``find_element_by_xpath`` was removed in Selenium 4.3.
        """
        return self.driver.find_element('xpath', xpath)

    def _scrape_city(self, ct):
        """Search for postings in city *ct* and walk through the result pages."""
        query = '搜索的崗位'+ct
        self._find(self._QUERY_INPUT_XPATH).clear()
        self._find(self._QUERY_INPUT_XPATH).send_keys(query)
        sleep(0.2)
        self._find(self._SEARCH_BUTTON_XPATH).click()  # run the search
        sleep(1)

        while True:  # one iteration per result page
            page = Selector(text=self.driver.page_source)
            # Every <li> of the result list is one job posting.
            postings = page.xpath('//*[@id="main"]/div/div[2]/ul/li').extract()

            only_requested_city = True
            for posting_html in postings:
                if not self._scrape_posting(posting_html, ct):
                    # Keep processing this page, but do not go further.
                    only_requested_city = False

            # Fewer entries than a full page means there is no next page.
            if len(postings) < self._PAGE_SIZE:
                print('沒有下一頁了')
                break
            # A next page may exist but hold other cities' postings.
            if not only_requested_city:
                break
            # The site shows at most 10 pages; on the last one the "next"
            # link is a no-op, and following it would loop forever.
            if ''.join(page.xpath('//a[@ka="page-next"]/@href').extract()) == "javascript:;":
                break
            self._find(self._NEXT_PAGE_XPATH).click()  # go to the next page
            print('準備抓取下一頁')
            sleep(random.randint(1,5))  # pause to reduce the risk of an IP ban

    def _scrape_posting(self, posting_html, ct):
        """Process one listing entry; return False when it is for another city.

        When a search has no exact match the site returns postings from other
        cities of the same province; those are skipped and reported with
        ``False`` so the caller stops paging. Entries whose location text is
        missing or cannot be parsed are skipped and reported with ``True``.
        """
        sel = Selector(text=posting_html)
        location_texts = sel.xpath('//p/text()').extract()
        if not location_texts:
            return True
        city_match = self._CITY_RE.search(location_texts[0])
        if city_match is None:
            # The page occasionally renders without a parsable city.
            return True
        if city_match.group().strip() != ct:
            return False

        # Hiring company
        company = ''.join(sel.xpath('//div[@class="company-text"]/h4/a/text()').extract()).strip()
        # Required education
        education = ''.join(sel.xpath('//div[@class="info-primary"]/p/text()[3]').extract()).strip()
        # Required work experience
        experience = ''.join(sel.xpath('//div[@class="info-primary"]/p/text()[2]').extract()).strip()
        # Index attribute identifying this posting's link on the page
        href_index = ''.join(sel.xpath('//div[@class="info-primary"]/h4[@class="name"]/a/@data-index').extract()).strip()

        self._scrape_detail(href_index, company, education, experience)
        return True

    def _scrape_detail(self, href_index, company, education, experience):
        """Open the posting's detail tab, print its fields and close the tab."""
        self._find('//div[@class="info-primary"]/h4/a[@data-index="{}"]/div[@class="job-title"]'.format(href_index)).click()

        handles = self.driver.window_handles  # list of all open windows
        if len(handles) < 2:
            # The click did not open a detail tab; nothing to read or close.
            self.logger.warning('detail tab did not open for data-index %s', href_index)
            return

        # Switching also makes page_source refer to the detail page.
        self.driver.switch_to.window(handles[1])
        try:
            sleep(1)
            se = Selector(text=self.driver.page_source)
            # Job title
            job_name = ''.join(se.xpath('//div[@class="name"]/h2/text()').extract()).strip()
            # Salary
            salary = ''.join(se.xpath('//div[@class="name"]/span[@class="salary"]/text()').extract()).strip()
            # Benefits
            welfare = ';'.join(se.xpath('//*[@id="main"]/div[1]/div/div/div[2]/div[3]/div[2]/span/text()').extract()).strip()
            # Publication time
            publishtime = ''.join(re.findall(r'\d+.*',''.join(se.xpath('//*[@id="main"]/div[3]/div/div[1]/div[2]/p[@class="gray"]/text()').extract()))).strip()
            # Job responsibilities (kept as HTML of the text container)
            Duty = ''.join(se.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div[@class="text"]').extract()).strip()
            # Full address
            address = ''.join(se.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div/div[@class="job-location"]/div[@class="location-address"]/text()').extract()).strip()

            print('發(fā)布時間:',publishtime)
            print('崗位名稱:',job_name)
            print('招聘單位:',company)
            print('學(xué)歷要求:',education)
            print('工作經(jīng)驗:',experience)
            print('薪資:',salary)
            print('福利:',welfare)
            print('地址:',address)
            print('崗位職責(zé)：',Duty)
        finally:
            # Always close the detail tab: leaving tabs open consumes a lot
            # of resources and can bring the machine down on large crawls.
            self.driver.close()
            sleep(0.5)
            self.driver.switch_to.window(handles[0])  # back to the result list

數據爬取完畢。

pipelines，settings和item的代碼千篇一律，這里就不放上來了。

由于使用的是selenium，注定了爬取速度不會很快。

數據無價，且爬且珍惜。

向AI問一下細節(jié)

記錄抓取某直聘網(wǎng)站

-- coding: utf-8 --

boss > 各地口腔招聘

猜你喜歡

最新資訊

相關(guān)推薦

相關(guān)標簽