在Scrapy中处理多级页面跳转通常可以通过两种方式来实现：
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class MyCrawlSpider(CrawlSpider):
    """Rule-based spider that crawls multi-level pages automatically.

    CrawlSpider walks every response through its ``rules``: each matching
    link is requested and routed to the rule's ``callback``.
    """

    name = 'my_crawl_spider'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        # follow=True is required for multi-level traversal: when a
        # callback is given, `follow` defaults to False and the spider
        # would stop at the first level of matched links.
        Rule(LinkExtractor(allow='item'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract data from a matched item page.

        :param response: the downloaded page routed here by the Rule.
        """
        # Extract data here.
        pass
import scrapy
class MySpider(scrapy.Spider):
    """Manual spider that follows pagination links level by level.

    ``parse`` handles the first level and yields a follow-up request for
    the next page, which is handled by ``parse_next_page``.
    """

    name = 'my_spider'
    start_urls = ['http://www.example.com']

    def parse(self, response):
        """Extract data from the first-level page and follow the next link."""
        # Extract data here.

        # .get() is the modern API; .extract_first() is a deprecated alias.
        next_page_url = response.css('a.next_page::attr(href)').get()
        if next_page_url:
            # response.follow resolves relative URLs against the current page.
            yield response.follow(next_page_url, callback=self.parse_next_page)

    def parse_next_page(self, response):
        """Extract data from a followed (next-level) page."""
        # Extract data here.
        pass
使用以上两种方法之一，你可以很方便地处理多级页面跳转并提取需要的数据。