This article walks through a hands-on Scrapy project that crawls user profile details from a developer community (SegmentFault). The project consists of the following files:
get_cookies.py
from selenium import webdriver
from pymongo import MongoClient
# from segmentfault import settings
import time

import settings


class GetCookies(object):
    def __init__(self):
        # Set up the webdriver options
        self.opt = webdriver.ChromeOptions()
        # self.opt.add_argument("--headless")
        # Initialize the list of login accounts
        self.user_list = settings.USER_LIST
        # Initialize the MongoDB connection
        self.client = MongoClient(settings.MONGO_URI)
        self.db = self.client[settings.MONGO_DB]
        self.collection = self.db["cookies"]

    def get_cookies(self, username, password):
        """
        Log in with Selenium and return the session cookies.
        :param username:
        :param password:
        :return: cookies
        """
        # Create the driver with the configured options
        driver = webdriver.Chrome(
            executable_path="/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver",
            options=self.opt)
        driver.get("https://segmentfault.com/user/login")
        driver.find_element_by_name("username").send_keys(username)
        driver.find_element_by_name("password").send_keys(password)
        driver.find_element_by_xpath("//button[@type='submit']").click()
        time.sleep(2)
        driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
        # Grab the page cookies after logging in
        cookies = driver.get_cookies()
        driver.quit()
        return cookies

    def format_cookies(self, cookies):
        """
        driver.get_cookies() returns a list of dicts such as:
        [{'domain': 'segmentfault.com', 'httpOnly': False, 'name': 'PHPSESSID',
          'path': '/', 'secure': False, 'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
         {'domain': '.segmentfault.com', 'expiry': 1581602940, 'httpOnly': False,
          'name': 'Hm_lvt_e23800c454aa573c0ccb16b52665ac26', 'path': '/',
          'secure': False, 'value': '1550066940'},
         {'domain': '.segmentfault.com', 'httpOnly': False,
          'name': 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26', 'path': '/',
          'secure': False, 'value': '1550066940'},
         {'domain': '.segmentfault.com', 'expiry': 1550067000, 'httpOnly': False,
          'name': '_gat', 'path': '/', 'secure': False, 'value': '1'},
         {'domain': '.segmentfault.com', 'expiry': 1550153340, 'httpOnly': False,
          'name': '_gid', 'path': '/', 'secure': False,
          'value': 'GA1.2.783265084.1550066940'},
         {'domain': '.segmentfault.com', 'expiry': 1613138940, 'httpOnly': False,
          'name': '_ga', 'path': '/', 'secure': False,
          'value': 'GA1.2.1119166665.1550066940'}]
        Only the name and value of each entry are needed.
        :param cookies: list of cookie dicts from driver.get_cookies()
        :return: dict mapping cookie name to value
        """
        c = dict()
        for item in cookies:
            c[item['name']] = item['value']
        return c

    def save(self):
        print("Start fetching cookies....")
        # Log in with each username/password pair and collect its cookies
        for username, password in self.user_list:
            cookies = self.get_cookies(username, password)
            f_cookies = self.format_cookies(cookies)
            print("insert cookie:{}".format(f_cookies))
            # Insert the formatted cookies into MongoDB
            self.collection.insert_one(f_cookies)
            # s = db[self.collection].find()
            # for i in s:
            #     print(i)


if __name__ == '__main__':
    cookies = GetCookies()
    for i in range(20):
        cookies.save()
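Before wiring these cookies into the spider, it can be worth confirming that they actually landed in MongoDB. The snippet below is a minimal sketch (not part of the original project) that reuses the same settings module; it assumes MongoDB is reachable at MONGO_URI and that save() has already run at least once.

from pymongo import MongoClient

import settings

# Connect with the same connection info GetCookies uses
client = MongoClient(settings.MONGO_URI)
collection = client[settings.MONGO_DB]["cookies"]

# Each document is one flattened cookie dict, e.g. {'PHPSESSID': '...', '_ga': '...'}
print("stored cookie documents:", collection.count_documents({}))
for doc in collection.find().limit(3):
    print(doc)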
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SegmentfaultItem(scrapy.Item):
    # define the fields for your item here like:
    # Personal profile
    # Name
    name = scrapy.Field()
    # Reputation
    rank = scrapy.Field()
    # School
    school = scrapy.Field()
    # Major
    majors = scrapy.Field()
    # Company
    company = scrapy.Field()
    # Job title
    job = scrapy.Field()
    # Blog
    blog = scrapy.Field()

    # Social activity data
    # Number of users followed
    following = scrapy.Field()
    # Number of followers
    fans = scrapy.Field()
    # Number of answers
    answers = scrapy.Field()
    # Number of questions
    questions = scrapy.Field()
    # Number of articles
    articles = scrapy.Field()
    # Number of lives (talks)
    lives = scrapy.Field()
    # Number of badges
    badges = scrapy.Field()

    # Skill data
    # Number of likes received
    like = scrapy.Field()
    # Skills
    skills = scrapy.Field()
    # Registration date
    register_date = scrapy.Field()

    # Q&A statistics
    # Highest answer score
    answers_top_score = scrapy.Field()
    # Title of the question with the top-voted answer
    answers_top_title = scrapy.Field()
    # Tags of the question with the top-voted answer
    answers_top_tags = scrapy.Field()
    # Content of the question with the top-voted answer
    answers_top_question = scrapy.Field()
    # Content of the top-voted answer itself
    answers_top_content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo


class SegmentfaultPipeline(object):
    # MongoDB collection name
    collection_name = 'userinfo'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # Read the MongoDB connection info defined in settings.py via the crawler
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB', 'segmentfault')
        )

    # Connect to MongoDB when the spider starts
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # Close the MongoDB connection when the spider closes
    def close_spider(self, spider):
        self.client.close()

    # Insert the item into the database
    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
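Because the crawl rules can reach the same profile more than once, the pipeline above will happily store duplicate documents. One possible variant (a sketch, not the author's implementation) replaces process_item with an upsert keyed on the item's name field:

    # Hypothetical variant of SegmentfaultPipeline.process_item that upserts on
    # the "name" field instead of inserting a new document every time.
    def process_item(self, item, spider):
        data = dict(item)
        self.db[self.collection_name].update_one(
            {'name': data.get('name')},  # match an existing profile by name
            {'$set': data},              # refresh its fields with the new crawl
            upsert=True                  # insert if this user has not been seen yet
        )
        return item

Keying on the display name is only a rough heuristic; the profile URL slug would be a more reliable unique key if it were added to the item.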
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'segmentfault'

SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

RETRY_ENABLED = False
REDIRECT_ENABLED = False
DOWNLOAD_TIMEOUT = 5

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
    'segmentfault.middlewares.SegmentfaultUserAgentMiddleware': 643,
    'segmentfault.middlewares.SegmentfaultCookiesMiddleware': 743,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'segmentfault.pipelines.SegmentfaultPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB configuration
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'

# Login accounts used to fetch cookies
USER_LIST = [
    ("798549150@qq.com", "guoqing1010"),
    ("learnscrapy@163.com", "guoqing1010"),
]

# Proxy pool
PROXY_LIST = [
    'http://115.182.212.169:8080',
    'http://121.61.25.149:9999',
    'http://180.118.247.189:9000',
    'http://115.151.3.12:9999',
    'http://183.154.213.160:9000',
    'http://113.128.9.106:9999',
    'http://124.42.68.152:90',
    'http://49.70.48.50:9999',
    'http://113.128.11.172:9999',
    'http://111.177.177.40:9999',
    'http://59.62.83.253:9999',
    'http://39.107.84.185:8123',
    'http://124.94.195.107:9999',
    'http://111.177.160.132:9999',
    'http://120.25.203.182:7777'
]

# User-Agent pool
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Opera/8.0 (Windows NT 5.1; U; en)',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]
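The SegmentfaultHttpProxyMiddleware that would consume PROXY_LIST is commented out in DOWNLOADER_MIDDLEWARES, and free proxies like these go stale quickly. A throwaway check such as the following sketch can tell you which ones still respond before you enable the middleware (it assumes the requests package is installed and that the script runs from the package directory, so that import settings works the same way it does in get_cookies.py):

import requests

import settings

for proxy in settings.PROXY_LIST:
    try:
        # Route a single request through the proxy with a short timeout
        r = requests.get('https://segmentfault.com',
                         proxies={'http': proxy, 'https': proxy},
                         timeout=5)
        print(proxy, '->', r.status_code)
    except requests.RequestException as e:
        print(proxy, '-> failed:', type(e).__name__)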
userinfo.py
# -*- coding: utf-8 -*-
import scrapy
import time
from scrapy import Request
from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import FormRequest
from segmentfault.items import SegmentfaultItem


class UserinfoSpider(CrawlSpider):
    name = 'userinfo'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']

    rules = (
        # User profile pages: follow and parse them
        Rule(LinkExtractor(allow=r'/u/\w+$'), callback='parse_item', follow=True),
        # "Followed" list pages: follow them to collect more profile URLs
        # Rule(LinkExtractor(allow=r'/users/followed$'), follow=True),
        # "Following" list pages: follow them to collect more profile URLs
        Rule(LinkExtractor(allow=r'/users/following$'), follow=True),
        # Follow pagination links
        # Rule(LinkExtractor(allow=r'/users/[followed|following]?page=\d+'), follow=True),
    )

    def start_requests(self):
        # Fetch one cookie document from MongoDB and attach it to the first request
        client = MongoClient(self.crawler.settings['MONGO_URI'])
        db = client[self.crawler.settings['MONGO_DB']]
        cookies_collection = db.cookies
        # Get one cookie
        cookies = cookies_collection.find_one()
        # 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26' holds a 10-digit timestamp,
        # so refresh it with the current time
        cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
        return [Request("https://segmentfault.com",
                        cookies=cookies,
                        meta={'cookiejar': 1},
                        callback=self.after_login)]

    # After logging in, start crawling from start_urls
    def after_login(self, response):
        for url in self.start_urls:
            return self.make_requests_from_url(url)

    # def after_login(self, response):
    #     yield Request(self.start_urls[0],
    #                   meta={'cookiejar': response.meta['cookiejar']},
    #                   callback=self.parse_item)

    def parse_item(self, response):
        """Parse one user profile page."""
        item = SegmentfaultItem()

        # Personal profile section
        profile_head = response.css('.profile__heading')
        # Name
        item['name'] = profile_head.css('h3[class*=name]::text').re_first(r'\w+')
        # Reputation
        item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
        # School and major
        school_info = profile_head.css('.profile__school::text').extract()
        if school_info:
            # School
            item['school'] = school_info[0]
            # Major
            item['majors'] = school_info[1].strip()
        else:
            item['school'] = ''
            item['majors'] = ''
        # Company and job title
        company_info = profile_head.css('.profile__company::text').extract()
        if company_info:
            # Company
            item['company'] = company_info[0]
            # Job title
            item['job'] = company_info[1].strip()
        else:
            item['company'] = ''
            item['job'] = ''
        # Personal blog
        item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()

        # Statistics panel
        profile_active = response.xpath("//div[@class='col-md-2']")
        # Number of users followed
        item['following'] = profile_active.css('div[class*=info] a > .h6::text').re(r'\d+')[0]
        # Number of followers
        item['fans'] = profile_active.css('div[class*=info] a > .h6::text').re(r'\d+')[1]
        # Number of answers
        item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
        # Number of questions
        item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
        # Number of articles
        item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
        # Number of lives
        item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
        # Number of badges
        item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
        # URL of the badge detail page
        badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()

        # Skills panel
        profile_skill = response.xpath("//div[@class='col-md-3']")
        # Skill tag list
        item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
        # Number of likes received
        item['like'] = profile_skill.css('.authlist').re_first(r'获得 (\d+) 次点赞')
        # Registration date
        item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()
        # if register_time:
        #     item['register_date'] = ''.join(re.findall(r'\d+', register_time))
        # else:
        #     item['register_date'] = ''

        # Output panel
        profile_work = response.xpath("//div[@class='col-md-7']")
        # Highest answer score
        item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
        # Title of the question with the top-voted answer
        item['answers_top_title'] = profile_work.css('#navAnswer div[class*=title-warp] > a::text').extract_first()
        # URL of the question with the top-voted answer
        answer_url = profile_work.css('#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()

        # Pass the URLs that still need to be crawled, together with the item,
        # to the next callback
        request = scrapy.Request(
            # Question detail page URL
            url=response.urljoin(answer_url),
            meta={
                # The partially filled item travels with the request
                'item': item,
                # Badge detail page URL
                'badge_url': response.urljoin(badge_url)},
            # Continue in parse_answer
            callback=self.parse_answer)
        yield request

    def parse_answer(self, response):
        # Retrieve the item passed along
        item = response.meta['item']
        # Retrieve the badge detail page URL
        badge_url = response.meta['badge_url']
        # Question tag list
        item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
        # Collect the strings that make up the question body
        question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
        # Join them and store in the item
        item['answers_top_question'] = ''.join(question_content)
        # Collect the strings that make up the answer body
        answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
        # Join them and store in the item
        item['answers_top_content'] = ''.join(answer_content)

        # After the question page, continue to the badge page and pass the
        # updated item along
        request = scrapy.Request(url=badge_url,
                                 meta={'item': item},
                                 callback=self.parse_badge)
        yield request

    def parse_badge(self, response):
        item = response.meta['item']
        badge_name = response.css('span.badge span::text').extract()
        badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
        name_count = {}
        for i in range(len(badge_count)):
            name_count[badge_name[i]] = badge_count[i]
        item['badges'] = name_count
        yield item
middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random
import re
import datetime
import scrapy
import logging
import time
from scrapy.conf import settings
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import pymongo

logger = logging.getLogger(__name__)


class SegmentfaultSpiderMiddleware(object):
    """
    Normalize the three registration-date formats stored in the item:
    1. 注册于 2015年12月12日  (registered on a full date)
    2. 注册于 3 天前          (registered N days ago)
    3. 注册于 5 小时前        (registered N hours ago)
    """

    def process_spider_output(self, response, result, spider):
        """
        Called on the spider output; rewrites register_date on each item.
        :param response:
        :param result: iterable containing the items
        :param spider:
        :return: items with a normalized register_date
        """
        for item in result:
            # Only touch scrapy.Item objects, not requests
            if isinstance(item, scrapy.Item):
                # Current time
                now = datetime.datetime.now()
                register_date = item['register_date']
                logger.info("raw register_date: {}".format(register_date))
                # Extract the digits, e.g. '注册于2015年12月12日' => '20151212'
                day = ''.join(re.findall(r'\d+', register_date))
                # More than 4 digits: a full date such as '注册于2015年12月12日'
                if len(day) > 4:
                    date = day
                # Contains '时': a form such as '注册于8小时前' (hours ago)
                elif '时' in register_date:
                    d = now - datetime.timedelta(hours=int(day))
                    date = d.strftime("%Y%m%d")
                # Otherwise: a form such as '注册于3天前' (days ago)
                else:
                    d = now - datetime.timedelta(days=int(day))
                    date = d.strftime("%Y%m%d")
                # Store the normalized value
                item['register_date'] = date
            yield item


class SegmentfaultHttpProxyMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        self.proxy_list = settings['PROXY_LIST']

    def process_request(self, request, spider):
        proxy = random.choice(self.proxy_list)
        logger.info('Using proxy: {}'.format(proxy))
        request.meta['proxy'] = proxy


class SegmentfaultUserAgentMiddleware(object):
    def __init__(self):
        self.useragent_list = settings['USER_AGENT_LIST']

    def process_request(self, request, spider):
        user_agent = random.choice(self.useragent_list)
        # logger.info('Using User-Agent: {}'.format(user_agent))
        request.headers['User-Agent'] = user_agent


class SegmentfaultCookiesMiddleware(object):
    client = MongoClient(settings['MONGO_URI'])
    db = client[settings['MONGO_DB']]
    collection = db['cookies']

    def get_cookies(self):
        """
        Pick a random cookie document.
        :return: cookie dict
        """
        cookies = random.choice([cookie for cookie in self.collection.find()])
        # Drop the unneeded "_id" and "_gat" fields
        cookies.pop('_id')
        cookies.pop('_gat')
        # Refresh "Hm_lpvt_e23800c454aa573c0ccb16b52665ac26" with the current time
        cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
        return cookies

    def remove_cookies(self, cookies):
        """
        Remove an expired cookie document.
        :param cookies:
        :return:
        """
        # Pop a random key/value pair from the cookies; the result is a tuple
        i = cookies.popitem()
        # Delete the matching document
        try:
            logger.info("Removing cookies {}".format(cookies))
            self.collection.remove({i[0]: i[1]})
        except Exception as e:
            logger.info("No this cookies: {}".format(cookies))

    def process_request(self, request, spider):
        """
        Attach a cookie to every request.
        :param request:
        :param spider:
        :return:
        """
        cookies = self.get_cookies()
        request.cookies = cookies

    def process_response(self, request, response, spider):
        """
        When the session has expired, the site redirects to the login page; in that
        case attach a fresh cookie and put the request back into the scheduler.
        :param request:
        :param response:
        :param spider:
        :return:
        """
        if response.status in [301, 302]:
            logger.info("Redirect response: {}".format(response))
            redirect_url = response.headers['location']
            if b'/user/login' in redirect_url:
                logger.info("Cookies expired")
                # The request failed: fetch a new cookie, attach it to the request,
                # and return the request so it goes back to the scheduler and later
                # middlewares are skipped
                new_cookie = self.get_cookies()
                logger.info("Fetched new cookie: {}".format(new_cookie))
                # Remove the stale cookies from the pool
                self.remove_cookies(request.cookies)
                request.cookies = new_cookie
                return request
        return response
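The date normalization in SegmentfaultSpiderMiddleware is the easiest part of this middleware to get wrong, so here is the same logic pulled out into a standalone function that can be tested without running a crawl (a sketch that mirrors the middleware above; the sample strings are illustrative):

import datetime
import re


def normalize_register_date(register_date, now=None):
    """Normalize '注册于 2015年12月12日', '注册于 3 天前' and '注册于 5 小时前' to YYYYMMDD."""
    now = now or datetime.datetime.now()
    day = ''.join(re.findall(r'\d+', register_date))
    if len(day) > 4:
        # Full date, e.g. '20151212'
        return day
    elif '时' in register_date:
        # "N hours ago"
        return (now - datetime.timedelta(hours=int(day))).strftime("%Y%m%d")
    else:
        # "N days ago"
        return (now - datetime.timedelta(days=int(day))).strftime("%Y%m%d")


# Example with a fixed "now" so the output is predictable
now = datetime.datetime(2019, 2, 13, 12, 0)
print(normalize_register_date('注册于 2015年12月12日', now))  # 20151212
print(normalize_register_date('注册于 3 天前', now))          # 20190210
print(normalize_register_date('注册于 5 小时前', now))        # 20190213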
run.py
from scrapy import cmdline
# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies

if __name__ == '__main__':
    # Refresh the cookie pool before starting the crawl
    cookies = GetCookies()
    cookies.save()
    # Launch the spider
    name = 'userinfo'
    cmd = 'scrapy crawl {}'.format(name)
    cmdline.execute(cmd.split())
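cmdline.execute does not return (it exits the process once the crawl command finishes), so nothing placed after it in run.py would run. An alternative sketch (using Scrapy's CrawlerProcess API rather than the author's run.py) keeps everything in one Python process:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from get_cookies import GetCookies

if __name__ == '__main__':
    # Refresh the cookie pool first, just like run.py does
    GetCookies().save()

    # Run the spider in-process with the project's settings.py
    process = CrawlerProcess(get_project_settings())
    process.crawl('userinfo')  # spider name defined in UserinfoSpider.name
    process.start()            # blocks until the crawl is finished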
This concludes the walkthrough of the Scrapy project for crawling community user profiles. For more on crawling user data with Scrapy, search 億速云's earlier articles or browse the related articles below, and we hope you will continue to support 億速云!