Scrapy項目實戰之爬取某社區用戶詳情

發布時間：2020-10-13 09:35:41 來源：腳本之家 閱讀：162 作者：hankleo 欄目：開發技術

本文介紹了Scrapy項目實戰之爬取某社區用戶詳情，分享給大家，具體如下：

get_cookies.py

from selenium import webdriver
from pymongo import MongoClient
from scrapy.crawler import overridden_settings
# from segmentfault import settings
import time
import settings

class GetCookies(object):
    """Log in to SegmentFault with Selenium and store the session cookies in MongoDB.

    Every (username, password) pair from ``settings.USER_LIST`` is logged in
    through a Chrome WebDriver session; the resulting cookies are flattened to
    a ``{name: value}`` dict and inserted into the ``cookies`` collection.
    """

    def __init__(self):
        # WebDriver options shared by every browser session.
        self.opt = webdriver.ChromeOptions()
        # self.opt.add_argument("--headless")
        # (username, password) pairs to log in with.
        self.user_list = settings.USER_LIST
        # MongoDB handles; cookies go to the "cookies" collection.
        self.client = MongoClient(settings.MONGO_URI)
        self.db = self.client[settings.MONGO_DB]
        self.collection = self.db["cookies"]

    def get_cookies(self, username, password):
        """Log in with the given account and return the browser cookies.

        :param username: account name typed into the login form
        :param password: account password typed into the login form
        :return: the list of cookie dicts reported by ``driver.get_cookies()``
        """
        driver = webdriver.Chrome(
            executable_path="/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver",
            options=self.opt,
        )
        # Always quit the browser, even when a page element is missing or the
        # navigation fails; otherwise every failed login leaks a Chrome process.
        try:
            driver.get("https://segmentfault.com/user/login")
            driver.find_element_by_name("username").send_keys(username)
            driver.find_element_by_name("password").send_keys(password)
            # "//button" selects the submit button anywhere in the document.
            driver.find_element_by_xpath("//button[@type='submit']").click()
            # Give the login round-trip time to finish before navigating on.
            time.sleep(2)
            driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
            # Cookies of the logged-in session.
            return driver.get_cookies()
        finally:
            driver.quit()

    def format_cookies(self, cookies):
        """Flatten WebDriver cookies to a ``{name: value}`` dict.

        ``driver.get_cookies()`` returns a list of dicts such as
        ``{'domain': 'segmentfault.com', 'httpOnly': False,
        'name': 'PHPSESSID', 'path': '/', 'secure': False, 'value': '...'}``;
        only the ``name`` and ``value`` of each entry are kept.

        :param cookies: iterable of cookie dicts with 'name' and 'value' keys
        :return: dict mapping cookie name to cookie value
        """
        return {entry['name']: entry['value'] for entry in cookies}

    def save(self):
        """Log in with every configured account and insert its cookies into MongoDB."""
        print("開(kāi)始獲取Cookies....")
        for username, password in self.user_list:
            cookies = self.get_cookies(username, password)
            f_cookies = self.format_cookies(cookies)
            print("insert cookie:{}".format(f_cookies))
            # One document per login, holding the flattened cookies.
            self.collection.insert_one(f_cookies)


if __name__ == '__main__':
    # Run the full login-and-store pass 20 times in a row; each pass inserts
    # one cookie document per configured account.
    collector = GetCookies()
    for _ in range(20):
        collector.save()

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SegmentfaultItem(scrapy.Item):
 """Fields collected for one SegmentFault user profile."""
 # define the fields for your item here like:
 # --- Personal profile ---
 # Display name
 name = scrapy.Field()
 # Reputation
 rank = scrapy.Field()
 # School
 school = scrapy.Field()
 # Major
 majors = scrapy.Field()
 # Company
 company = scrapy.Field()
 # Job title
 job = scrapy.Field()
 # blog
 blog = scrapy.Field()
 # --- Social activity ---
 # Number of users this user follows
 following = scrapy.Field()
 # Number of fans (followers)
 fans = scrapy.Field()
 # Number of answers
 answers = scrapy.Field()
 # Number of questions asked
 questions = scrapy.Field()
 # Number of articles
 articles = scrapy.Field()
 # Number of lives (lectures)
 lives = scrapy.Field()
 # Badges: first the badge count, later replaced by a {badge name: count}
 # dict when the spider's parse_badge callback runs
 badges = scrapy.Field()
 # --- Skills ---
 # Number of likes received
 like = scrapy.Field()
 # Skill tags
 skills = scrapy.Field()
 # Registration date
 register_date = scrapy.Field()
 # --- Q&A statistics ---
 # Highest score obtained by one of the user's answers
 answers_top_score = scrapy.Field()
 # Title of the question the top-scored answer belongs to
 answers_top_title = scrapy.Field()
 # Tags of the question the top-scored answer belongs to
 answers_top_tags = scrapy.Field()
 # Body of the question the top-scored answer belongs to
 answers_top_question = scrapy.Field()
 # Body of the top-scored answer itself (filled from the answer text by the spider)
 answers_top_content = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class SegmentfaultPipeline(object):
    """Item pipeline that writes every scraped item into a MongoDB collection."""

    # Name of the MongoDB collection that receives the items.
    collection_name = 'userinfo'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; the connection is opened later.

        :param mongo_uri: MongoDB connection URI / host string
        :param mongo_db: name of the database to write to
        """
        self.mongo_uri, self.mongo_db = mongo_uri, mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI / MONGO_DB crawler settings."""
        conf = crawler.settings
        uri = conf.get('MONGO_URI')
        database = conf.get('MONGO_DB', 'segmentfault')
        return cls(mongo_uri=uri, mongo_db=database)

    def open_spider(self, spider):
        """Connect to MongoDB when the spider starts."""
        connection = pymongo.MongoClient(self.mongo_uri)
        self.client = connection
        self.db = connection[self.mongo_db]

    def close_spider(self, spider):
        """Drop the MongoDB connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert a plain-dict copy of the item and pass the item on unchanged."""
        record = dict(item)
        self.db[self.collection_name].insert_one(record)
        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#  https://doc.scrapy.org/en/latest/topics/settings.html
#  https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#  https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'segmentfault'

SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): this value is a desktop Chrome UA string, not a bot identifier;
# SegmentfaultUserAgentMiddleware also overwrites the User-Agent header on
# every request it processes.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Obey robots.txt rules
# NOTE(review): robots.txt is ignored by this project.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# NOTE(review): 100 is far above the default and no DOWNLOAD_DELAY is set below.
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Failed requests are not retried automatically.
RETRY_ENABLED = False

# Redirects are not followed automatically; SegmentfaultCookiesMiddleware
# inspects 301/302 responses itself to detect a redirect to the login page.
REDIRECT_ENABLED = False

# Download timeout (in seconds, per the Scrapy settings reference).
DOWNLOAD_TIMEOUT = 5

# HTTPALLOW

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}


# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SegmentfaultSpiderMiddleware normalises the register_date field of items.
SPIDER_MIDDLEWARES = {
 'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# The project's User-Agent and cookie middlewares are enabled; the custom proxy
# middleware is commented out, and Scrapy's built-in HttpProxy and UserAgent
# middlewares are switched off by mapping them to None.
# NOTE(review): Scrapy's built-in CookiesMiddleware defaults to priority 700,
# i.e. it sees the request before SegmentfaultCookiesMiddleware (743) assigns
# request.cookies -- confirm the assigned cookies actually reach the request.
DOWNLOADER_MIDDLEWARES = {
 # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
 'segmentfault.middlewares.SegmentfaultUserAgentMiddleware':643,
 'segmentfault.middlewares.SegmentfaultCookiesMiddleware':743,
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
 # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware':None,

}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# SegmentfaultPipeline stores every item in MongoDB.
ITEM_PIPELINES = {
 'segmentfault.pipelines.SegmentfaultPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection settings, read by get_cookies.py, the item pipeline,
# the spider and the cookies middleware.
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'

# Accounts used by get_cookies.py to obtain login cookies:
# a list of (username, password) pairs.
# NOTE(review): these credentials are committed in plain text -- consider
# loading them from the environment or an untracked file instead.
USER_LIST = [
 ("798549150@qq.com","guoqing1010"),
 ("learnscrapy@163.com","guoqing1010"),
]

# Proxy list. SegmentfaultHttpProxyMiddleware picks one entry at random per
# request; that middleware is currently commented out in DOWNLOADER_MIDDLEWARES.
PROXY_LIST = [
 'http://115.182.212.169:8080',
 'http://121.61.25.149:9999',
 'http://180.118.247.189:9000',
 'http://115.151.3.12:9999',
 'http://183.154.213.160:9000',
 'http://113.128.9.106:9999',
 'http://124.42.68.152:90',
 'http://49.70.48.50:9999',
 'http://113.128.11.172:9999',
 'http://111.177.177.40:9999',
 'http://59.62.83.253:9999',
 'http://39.107.84.185:8123',
 'http://124.94.195.107:9999',
 'http://111.177.160.132:9999',
 'http://120.25.203.182:7777'
]

# Pool of browser User-Agent strings; SegmentfaultUserAgentMiddleware assigns
# one of them, chosen at random, to every outgoing request.
USER_AGENT_LIST = [
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
 'Opera/8.0 (Windows NT 5.1; U; en)',
 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]

userinfo.py

# -*- coding: utf-8 -*-
import scrapy
import time
from scrapy import Request
from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from scrapy.http import FormRequest
from segmentfault.items import SegmentfaultItem


class UserinfoSpider(CrawlSpider):
    """Crawl SegmentFault user profiles, starting from one user's "following" list.

    Flow: ``start_requests`` issues a request carrying a stored login cookie,
    ``after_login`` then schedules ``start_urls``; the crawl rules follow user
    home pages (``parse_item``), and each profile is completed by visiting the
    user's top-scored answer (``parse_answer``) and badge page
    (``parse_badge``), which finally yields the item.
    """

    name = 'userinfo'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']

    rules = (
        # User home pages: parse them and keep following their links.
        Rule(LinkExtractor(allow=r'/u/\w+$'), callback='parse_item', follow=True),
        # Disabled: "followed" lists.
        # Rule(LinkExtractor(allow=r'/users/followed$'),follow=True),
        # "Following" lists: follow them to discover more user home pages.
        Rule(LinkExtractor(allow=r'/users/following$'), follow=True),
        # Disabled: further pages of the lists.
        # Rule(LinkExtractor(allow=r'/users/[followed|following]?page=\d+'),follow=True),
    )

    def start_requests(self):
        """Return the initial request, carrying one cookie set read from MongoDB.

        :raises RuntimeError: when the ``cookies`` collection is empty
        """
        client = MongoClient(self.crawler.settings['MONGO_URI'])
        try:
            db = client[self.crawler.settings['MONGO_DB']]
            cookies = db.cookies.find_one()
        finally:
            # The connection is only needed for this single lookup.
            client.close()
        if cookies is None:
            raise RuntimeError(
                "No cookies stored in MongoDB; run get_cookies.py first")
        # "_id" is MongoDB's document key, not a browser cookie.
        cookies.pop('_id', None)
        # 'Hm_lpvt_...' holds the current time as a 10-digit timestamp, so it
        # is refreshed instead of replaying the stored value.
        cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))

        return [Request("https://segmentfault.com",
                        cookies=cookies,
                        meta={'cookiejar': 1},
                        callback=self.after_login)]

    def after_login(self, response):
        """Schedule every start URL once the cookie-bearing request succeeded.

        Requests are created without a callback so that CrawlSpider's own
        default parsing (and therefore ``rules``) handles them.
        """
        for url in self.start_urls:
            yield Request(url, dont_filter=True)

    @staticmethod
    def _split_pair(texts):
        """Return ``(first, second)`` from a list of text nodes, '' where missing."""
        first = texts[0] if texts else ''
        second = texts[1].strip() if len(texts) > 1 else ''
        return first, second

    def _badge_request_or_item(self, item, badge_url):
        """Return the request for the badge page, or the item if there is no such page."""
        if badge_url:
            return scrapy.Request(url=badge_url,
                                  meta={'item': item},
                                  callback=self.parse_badge)
        return item

    def parse_item(self, response):
        """Extract the profile fields from a user home page.

        :param response: response of a ``/u/<name>`` page
        :return: yields the follow-up request (top answer page, else badge
                 page) carrying the partially filled item, or the item itself
                 when neither page is linked
        """
        item = SegmentfaultItem()
        # --- profile header ---
        profile_head = response.css('.profile__heading')
        item['name'] = profile_head.css('h3[class*=name]::text').re_first(r'\w+')
        item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
        # School / major and company / job come as up to two text nodes each;
        # a missing second node must not abort the whole profile.
        item['school'], item['majors'] = self._split_pair(
            profile_head.css('.profile__school::text').extract())
        item['company'], item['job'] = self._split_pair(
            profile_head.css('.profile__company::text').extract())
        item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()

        # --- statistics panel ---
        profile_active = response.xpath("//div[@class='col-md-2']")
        follow_counts = profile_active.css('div[class*=info] a > .h6::text').re(r'\d+')
        item['following'] = follow_counts[0] if follow_counts else None
        item['fans'] = follow_counts[1] if len(follow_counts) > 1 else None
        item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
        item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
        item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
        item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
        item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
        # URL of the badge detail page.
        badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()

        # --- skills panel ---
        profile_skill = response.xpath("//div[@class='col-md-3']")
        item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
        # "received N likes"; both Simplified and Traditional spellings are
        # accepted. NOTE(review): confirm the exact wording against the site.
        item['like'] = profile_skill.css('.authlist').re_first(
            r'(?:获得|獲得) (\d+) 次(?:点赞|點贊|點讚)')
        # Raw registration text; normalised later by the spider middleware.
        item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()

        # --- output panel ---
        profile_work = response.xpath("//div[@class='col-md-7']")
        item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
        item['answers_top_title'] = profile_work.css(
            '#navAnswer div[class*=title-warp] > a::text').extract_first()
        # URL of the question page holding the top-scored answer.
        answer_url = profile_work.css(
            '#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()

        badge_page = response.urljoin(badge_url) if badge_url else None
        if answer_url:
            # The item travels with the request and is completed downstream.
            yield scrapy.Request(
                url=response.urljoin(answer_url),
                meta={'item': item, 'badge_url': badge_page},
                callback=self.parse_answer)
            return

        # No answer link: urljoin() of a missing href would resolve to this
        # profile URL again, which the duplicate filter drops -- the item would
        # be lost. Fill the answer fields with empty values and move on.
        item['answers_top_tags'] = []
        item['answers_top_question'] = ''
        item['answers_top_content'] = ''
        yield self._badge_request_or_item(item, badge_page)

    def parse_answer(self, response):
        """Add the top answer's question tags, question text and answer text.

        :param response: response of the question page; ``meta`` carries the
                         item and the badge page URL
        :return: yields the badge-page request (or the item if there is none)
        """
        item = response.meta['item']
        badge_url = response.meta.get('badge_url')
        item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
        # Text fragments between tags, concatenated into one string.
        question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
        item['answers_top_question'] = ''.join(question_content)
        answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
        item['answers_top_content'] = ''.join(answer_content)

        yield self._badge_request_or_item(item, badge_url)

    def parse_badge(self, response):
        """Replace ``item['badges']`` with a ``{badge name: count}`` dict and yield the item."""
        item = response.meta['item']
        badge_name = response.css('span.badge span::text').extract()
        badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
        # zip() pairs names with counts and stops at the shorter list, so a
        # mismatch between the two selectors cannot raise IndexError.
        item['badges'] = dict(zip(badge_name, badge_count))
        yield item

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import re
import datetime
import scrapy
import logging
import time
from scrapy.conf import settings
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import pymongo
logger = logging.getLogger(__name__)


class SegmentfaultSpiderMiddleware(object):
    """Normalise the ``register_date`` field of scraped items to ``YYYYMMDD``.

    Three kinds of registration text are handled:
    1. an absolute date, e.g. "2015年12月12日"
    2. a number of days ago
    3. a number of hours ago
    """

    # "hour" marker in Simplified and Traditional Chinese.
    _HOUR_MARKERS = ('时', '時')
    # year / month / day separated by any non-digit characters.
    _ABSOLUTE_DATE = re.compile(r'(\d{4})\D+(\d{1,2})\D+(\d{1,2})')

    def _normalize(self, register_date, now):
        """Return ``register_date`` as ``YYYYMMDD``, or unchanged if unparseable."""
        if not register_date:
            # Field missing on the page: nothing to normalise.
            return register_date
        absolute = self._ABSOLUTE_DATE.search(register_date)
        if absolute:
            year, month, day = absolute.groups()
            # Zero-pad so "2015年1月2日" becomes 20150102, not 201512.
            return '{}{:0>2}{:0>2}'.format(year, month, day)
        digits = ''.join(re.findall(r'\d+', register_date))
        if len(digits) > 4:
            # More than four digits: treated as an absolute date already.
            return digits
        if not digits:
            logger.warning("Unrecognised register_date: %r", register_date)
            return register_date
        amount = int(digits)
        if any(marker in register_date for marker in self._HOUR_MARKERS):
            elapsed = datetime.timedelta(hours=amount)
        else:
            # Remaining case: "N days ago".
            elapsed = datetime.timedelta(days=amount)
        return (now - elapsed).strftime("%Y%m%d")

    def process_spider_output(self, response, result, spider):
        """Rewrite ``register_date`` on every item leaving the spider.

        :param response: the response being processed
        :param result: iterable of requests and items produced by the spider
        :param spider: the running spider
        :return: yields every element of ``result``; items get a normalised
                 ``register_date``, everything else passes through untouched
        """
        for item in result:
            # Requests pass through untouched; only items are rewritten.
            if isinstance(item, scrapy.Item):
                register_date = item.get('register_date')
                logger.info("獲取注冊(cè)日志格式為{}".format(register_date))
                item['register_date'] = self._normalize(
                    register_date, datetime.datetime.now())
            yield item


class SegmentfaultHttpProxyMiddleware(object):
    """Downloader middleware that sends each request through a random proxy.

    Only ``process_request`` is defined; Scrapy treats the missing hooks as
    "do not modify the passed objects".
    """

    def __init__(self):
        # Candidate proxies come from the PROXY_LIST project setting.
        self.proxy_list = settings['PROXY_LIST']

    def process_request(self, request, spider):
        """Pick a proxy at random and attach it to ``request.meta['proxy']``."""
        chosen = random.choice(self.proxy_list)
        logger.info('使用代理:{}'.format(chosen))
        request.meta['proxy'] = chosen


class SegmentfaultUserAgentMiddleware(object):
    """Downloader middleware that gives each request a random User-Agent."""

    def __init__(self):
        # Candidate UA strings come from the USER_AGENT_LIST project setting.
        self.useragent_list = settings['USER_AGENT_LIST']

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random pool entry."""
        request.headers['User-Agent'] = random.choice(self.useragent_list)



class SegmentfaultCookiesMiddleware(object):
    """Attach a stored login cookie set to every request and rotate stale ones.

    Cookie sets are read from the ``cookies`` MongoDB collection. When a
    response redirects to the login page, the cookie set used for that request
    is deleted from the collection and the request is retried with another one.
    """

    client = MongoClient(settings['MONGO_URI'])
    db = client[settings['MONGO_DB']]
    collection = db['cookies']

    # This cookie is rewritten with the current time on every use, so its
    # value never matches what is stored in MongoDB.
    _TIMESTAMP_COOKIE = 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'
    # request.meta key counting cookie-rotation retries, and its upper bound,
    # so a request cannot bounce between login redirects forever.
    _RETRY_META_KEY = 'cookie_retry_times'
    max_cookie_retries = 5

    def get_cookies(self):
        """Return a randomly chosen stored cookie set.

        :return: ``{name: value}`` dict, or an empty dict when the collection
                 holds no cookies
        """
        stored = list(self.collection.find())
        if not stored:
            return {}
        cookies = random.choice(stored)
        # "_id" is MongoDB's key and "_gat" is not needed; "_gat" is a
        # short-lived cookie and may be absent, hence the default.
        cookies.pop('_id', None)
        cookies.pop('_gat', None)
        cookies[self._TIMESTAMP_COOKIE] = str(int(time.time()))
        return cookies

    def remove_cookies(self, cookies):
        """Delete the stored document(s) matching an expired cookie set.

        :param cookies: the ``{name: value}`` dict that was sent with the request
        """
        if not cookies or not isinstance(cookies, dict):
            return
        # Match on every cookie except the rewritten timestamp, whose value
        # differs from the stored one.
        query = {key: value for key, value in cookies.items()
                 if key != self._TIMESTAMP_COOKIE}
        if not query:
            return
        try:
            logger.info("刪除cookies{}".format(cookies))
            self.collection.delete_many(query)
        except pymongo.errors.PyMongoError:
            logger.exception("No this cookies:{}".format(cookies))

    def process_request(self, request, spider):
        """Give the outgoing request a randomly chosen cookie set.

        :param request: the request about to be downloaded
        :param spider: the running spider
        :return: None, so the request continues through the middleware chain
        """
        cookies = self.get_cookies()
        request.cookies = cookies

    def process_response(self, request, response, spider):
        """Retry with fresh cookies when the response redirects to the login page.

        :param request: the request that produced ``response``
        :param response: the downloaded response
        :param spider: the running spider
        :return: a new request to re-schedule when the login expired and another
                 cookie set is available; otherwise the response itself
        """
        if response.status not in (301, 302):
            return response

        logger.info("Redirect response:{}".format(response))
        redirect_url = response.headers.get('Location') or b''
        if b'/user/login' not in redirect_url:
            # Ordinary redirect: not a cookie problem, pass it on.
            return response

        logger.info("Cookies失效")
        # Delete the stale set first so it cannot be picked again below.
        self.remove_cookies(request.cookies)

        attempts = request.meta.get(self._RETRY_META_KEY, 0)
        if attempts >= self.max_cookie_retries:
            logger.warning("Giving up on %s after %d cookie retries",
                           request.url, attempts)
            return response

        new_cookie = self.get_cookies()
        if not new_cookie:
            logger.warning("No valid cookies left for %s", request.url)
            return response
        logger.info("獲取新cookie:{}".format(new_cookie))

        # dont_filter is required: the scheduler has already seen this URL and
        # would otherwise drop the retry as a duplicate.
        retry = request.replace(cookies=new_cookie, dont_filter=True)
        retry.meta[self._RETRY_META_KEY] = attempts + 1
        return retry

run.py

from scrapy import cmdline
# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies

if __name__ == '__main__':
    # Refresh the login cookies first, then launch the spider through
    # Scrapy's command-line entry point.
    GetCookies().save()
    spider_name = 'userinfo'
    cmdline.execute('scrapy crawl {}'.format(spider_name).split())

到此這篇關于Scrapy項目實戰之爬取某社區用戶詳情的文章就介紹到這了，更多相關Scrapy 爬取某社區用戶內容請搜索億速云以前的文章或繼續瀏覽下面的相關文章，希望大家以后多多支持億速云！

向AI問一下細節

Scrapy項目實戰之爬取某社區用戶詳情

猜你喜歡

最新資訊

相關推薦

相關標簽