溫馨提示×

您好,登錄后才能下訂單哦!

密碼登錄×
登錄注冊(cè)×
其他方式登錄
點(diǎn)擊 登錄注冊(cè) 即表示同意《億速云用戶服務(wù)條款》

Python BeautifulSoup 爬取筆趣閣所有的小說

發布時間:2020-07-20 20:09:55 來源:網絡 閱讀:1218 作者:大吃小鯨魚 欄目:編程語言

這是一個練習作品。用python腳本爬取筆趣閣上面的免費小說。

環(huán)境:python3
類庫:BeautifulSoup
數(shù)據(jù)源:http://www.biqukan.cc

原理就是偽裝正常http請求,正常訪問網頁。然后通過bs4重新解析html結構來提取有效數據。

1. config文件

包含了偽裝請求頭部,數據源配置(如果不考慮擴展其他數據源,可以寫死)。

#!/usr/bin/python
#coding:utf-8
# Crawler configuration: data-source URLs and a pool of request headers.
#
# NOTE(review): the original Python 2-only lines
#   import sys; reload(sys); sys.setdefaultencoding('utf8')
# were removed. reload() is not a builtin on Python 3 (the article's stated
# target), so they raise NameError there, and Python 3 str is already
# unicode, making the default-encoding hack unnecessary.

# Data sources. Category pages are fetched by substituting an integer for
# '{id}' in category_url, for ids in [category_min, category_max).
source = {
    'biquge': {
        'base_url': 'http://www.biqukan.cc',
        'category_min': 1,
        'category_max': 2,
        'category_url': 'http://www.biqukan.cc/fenlei{id}/1.html'
    }
}

# User-Agent pool; the crawlers pick one at random per request so traffic
# looks like ordinary browsers instead of a single scripted client.
header = [
    {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'},
    {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'},
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'}
]

config.py文件

2. 爬取全部類目小說

#!/usr/bin/python
#coding:utf-8
# Crawl every category listing of biqukan.cc and collect novel metadata.
#
# NOTE(review): removed the Python 2-only reload(sys)/sys.setdefaultencoding
# hack -- reload() raises NameError on Python 3, the stated target, and the
# encoding tweak is unnecessary there.

import random
import requests
import re
from bs4 import BeautifulSoup
import sys
from config import source
from config import header
import hashlib
import time

# WARNING: one shared md5 object -- every update() folds into the same
# running digest, so digests depend on hashing order and are not
# reproducible per item. Kept only for backward compatibility; prefer
# hashlib.md5(data) created fresh per item.
hash_md5 = hashlib.md5()

##
# Walk every configured category and gather novel entries keyed by
# the md5 of each novel's link.
#
def fiction():
    """Crawl all category pages of the configured source.

    Returns:
        dict: md5-of-link -> novel info dict (as built by _cur_page) for
        every novel found on every pagination page of every category.
    """
    url = source['biquge']['category_url']
    _list = {}
    for i in range(source['biquge']['category_min'], source['biquge']['category_max']):
        req = requests.get(url.replace('{id}', '%s' % i), headers=header[random.randint(0, 4)])
        # errors='ignore' so one malformed byte does not abort a whole category
        bs = BeautifulSoup(req.content.decode('gbk', errors='ignore'), "html.parser")

        # BUG FIX: the original only harvested a page when it still had a
        # "next" link, silently dropping single-page categories and the last
        # page of every category. Harvest first, then look for the next page.
        while True:
            _page = _cur_page(bs)
            print('page.length = %d' % len(_page))
            _list.update(_page)

            pagelink = bs.find('ul', id='pagelink')
            next_page = pagelink.find('a', 'next') if pagelink is not None else None
            if next_page is None:
                break

            req = requests.get(next_page.attrs['href'], headers=header[random.randint(0, 4)])
            bs = BeautifulSoup(req.content.decode('gbk', errors='ignore'), "html.parser")

            # brief random pause so we do not hammer the site
            time.sleep(random.random())

    return _list

##
# Extract all novel entries from one parsed category page.
#
def _cur_page(bs):
    """Return {md5(link): info} for every novel visible on this page.

    Two sections are read: the "top" list (li.list-group-item) and the
    recently-updated table (tr rows). Table rows merge their status into
    entries already collected from the top list.
    """
    _list = {}
    # top list
    li_tags = bs.findAll('li', 'list-group-item')
    if li_tags is None or len(li_tags) <= 0:
        return _list

    for item in li_tags:
        a_tag = item.find('a')
        _item = {'name': a_tag.get_text(), 'link': a_tag.attrs['href']}

        # author text comes as "/ name"; strip the separator
        _item['author'] = item.find('small').get_text().replace('/ ', '')

        # reader count
        _item['readers'] = item.find('span').get_text()

        # BUG FIX: the original fed every link into one shared module-level
        # md5 object, so each digest depended on all previously hashed links
        # and keys were not reproducible across runs. Hash each link
        # independently (.encode() also satisfies Python 3's bytes-only md5).
        key = hashlib.md5(_item['link'].encode('utf-8')).hexdigest()
        _list[key] = _item

    # recently-updated table
    tr_tags = bs.findAll('tr')
    if tr_tags is None or len(tr_tags) <= 1:
        return _list

    for item in tr_tags:
        a_tag = item.find('a')
        if a_tag is None:
            continue  # header rows carry no link

        _item = {'name': a_tag.get_text(), 'link': a_tag.attrs['href']}

        # author column
        _item['author'] = item.find('td', 'text-muted').get_text()

        # last <td> holds the serialization status
        tds = item.findAll('td')
        _item['status'] = tds[-1].get_text()

        # dict.has_key() was removed in Python 3; use the `in` operator
        key = hashlib.md5(_item['link'].encode('utf-8')).hexdigest()
        if key not in _list:
            _list[key] = _item
        else:
            _list[key]['status'] = _item['status']

    return _list

if __name__ == "__main__":
    # Crawl every configured category, then report completion.
    all_fictions = fiction()
    print('done')

fiction.py文件

3. 爬取小說介紹信息

#!/usr/bin/python
#coding:utf-8
# Fetch and parse a novel's introduction page from biqukan.cc.
#
# NOTE(review): removed the Python 2-only reload(sys)/sys.setdefaultencoding
# hack -- reload() raises NameError on Python 3, the stated target.

import random
import requests
import re
from bs4 import BeautifulSoup
import sys
from config import source
from config import header

##
# Fetch a novel's landing page and return a metadata dict, e.g.:
#   title / author / category / word count / readers / status
#   cover image url / plot summary
#
def summary(url):
    """Fetch one book page and return its metadata.

    Args:
        url: book landing page, e.g. http://www.biqukan.cc/book/47583/

    Returns:
        dict with keys title, author, category, words, readers, status,
        cover_img, summary. Missing page sections leave their keys as
        empty strings instead of raising (the original crashed with
        AttributeError/IndexError when a section was absent).
    """
    _result = {'title': '', 'author': '', 'category': '', 'words': '',
               'readers': '', 'status': '', 'cover_img': '', 'summary': ''}

    req = requests.get(url, headers=header[random.randint(0, 4)])
    bs = BeautifulSoup(req.content.decode('gbk', errors='ignore'), "html.parser")

    title_tag = bs.find('h2', 'bookTitle')
    if title_tag is not None:
        _result['title'] = title_tag.get_text()

    book_tag = bs.find('p', 'booktag')
    if book_tag is not None:
        a_tags = book_tag.findAll('a')
        if len(a_tags) >= 2:  # [author, category] -- guard short lists
            _result['author'] = a_tags[0].get_text()
            _result['category'] = a_tags[1].get_text()

        span_tags = book_tag.findAll('span')
        if len(span_tags) >= 3:  # [words, readers, status]
            _result['words'] = span_tags[0].get_text()
            _result['readers'] = span_tags[1].get_text()
            _result['status'] = span_tags[2].get_text()

    intro_tag = bs.find('p', id='bookIntro')
    if intro_tag is not None:
        img_tag = intro_tag.find('img')
        if img_tag is not None:
            _result['cover_img'] = img_tag.attrs['src']
        # strip the fixed indentation the site embeds in the intro text
        _result['summary'] = (intro_tag.get_text()
                              .replace('\n\r\n                            ', '')
                              .replace('\r\n                        ', ''))

    return _result

if __name__ == "__main__":
    # Smoke test against one known book page.
    print(summary('http://www.biqukan.cc/book/47583/'))

summary.py文件

4. 爬取小說目錄

#!/usr/bin/python
#coding:utf-8
# Fetch a novel's chapter list (table of contents) from biqukan.cc.
#
# NOTE(review): removed the Python 2-only reload(sys)/sys.setdefaultencoding
# hack -- reload() raises NameError on Python 3, the stated target.

import random
import requests
import re
from bs4 import BeautifulSoup
import sys
from config import source
from config import header

# Fetch the chapter catalog of a book page.
def catalog(url):
    """Return the book's chapter list as [{'name': ..., 'link': ...}, ...].

    `link` is the book url with the chapter's relative href appended; an
    empty list is returned when the expected containers are missing.
    """
    req = requests.get(url, headers=header[random.randint(0, 4)])
    bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

    chapters = []
    container = bs.find('div', id='list-chapterAll')
    if container is None:
        return chapters

    chapter_list = container.find('dl', 'panel-chapterlist')
    if chapter_list is None:
        return chapters

    for anchor in chapter_list.findAll('a'):
        chapters.append({'name': anchor.get_text(),
                         'link': url + anchor.attrs['href']})
    return chapters

if __name__ == "__main__":
    # Smoke test against one known book page.
    print(catalog('http://www.biqukan.cc/book/47583/'))

catalog.py文件

5. 爬取小說正文

#!/usr/bin/python
#coding:utf-8
# Fetch a chapter's full body text from biqukan.cc.
#
# NOTE(review): removed the Python 2-only reload(sys)/sys.setdefaultencoding
# hack -- reload() raises NameError on Python 3, the stated target.

import random
import requests
import re
from bs4 import BeautifulSoup
import sys
from config import source
from config import header

##
# Fetch one chapter's body text, following its "_2"/"_3" continuation pages.
#
def detail(url):
    """Download a chapter, concatenating its continuation pages.

    Args:
        url: chapter page ending in '.html'; page i is fetched by rewriting
             it to '_<i>.html' (the site splits long chapters this way).

    Returns:
        str: concatenated chapter text with the site's ads stripped.
    """
    # Upper bound on pages per chapter -- TODO confirm 2 pages is always
    # enough for this site (range(1, 3) fetches pages 1 and 2, matching
    # the original behavior).
    per_article_limit_page = 3
    title = ''
    content = ''
    for i in range(1, per_article_limit_page):
        part_url = '' if i == 1 else '_%s' % i

        req = requests.get(url.replace('.html', part_url + '.html'),
                           headers=header[random.randint(0, 4)])
        bs = BeautifulSoup(req.content.decode('gbk', errors='ignore'), "html.parser")

        # Chapter title from the active breadcrumb item. NOTE(review): the
        # original crashed with AttributeError when the tag was missing, and
        # `title` is never returned -- kept for interface stability.
        if len(title) <= 0:
            title_tag = bs.find('li', 'active')
            if title_tag is not None:
                title = title_tag.get_text()

        content_tag = bs.find('div', id='htmlContent')
        if content_tag is None:
            break  # no body container -> past the last continuation page

        # drop the "next page" marker paragraph before extracting text
        next_tag = content_tag.find('p', 'text-danger')
        if next_tag is not None:
            next_tag.clear()

        content += (content_tag.get_text()
                    .replace('-->>', '')
                    .replace('一秒記住【筆趣閣 www.biqukan.cc】,更新快,無彈窗,免費(fèi)讀!', ''))
    return content

def filter(content):
    """Collapse raw chapter text: remove all spaces and drop blank lines.

    Splits on '\r\n', strips every space character from each line, and
    joins the non-empty remainder into one string.
    (NOTE: the name shadows the builtin `filter`; kept for compatibility.)
    """
    stripped = (line.replace(' ', '') for line in content.split('\r\n'))
    return ''.join(line for line in stripped if line)

if __name__ == "__main__":
    # Smoke test: fetch one known chapter and print its cleaned text.
    chapter_text = detail('http://www.biqukan.cc/book/20461/12592815.html')
    print(filter(chapter_text))

article.py文件

總結(jié)

暫沒有做數據保存模塊。如果需要串起來做成一個完整的項目的話,只需要把小說數據結構保存即可(節省磁盤空間)。通過小說url可以很快速的提取出小說簡介、目錄、每一章的正文。

如果想要做的更好,可以把目錄,介紹、正文等部分緩存起來,當然得有足夠的空間。

向AI問一下細(xì)節(jié)

免責(zé)聲明:本站發(fā)布的內(nèi)容(圖片、視頻和文字)以原創(chuàng)、轉(zhuǎn)載和分享為主,文章觀點(diǎn)不代表本網(wǎng)站立場(chǎng),如果涉及侵權(quán)請(qǐng)聯(lián)系站長(zhǎng)郵箱:is@yisu.com進(jìn)行舉報(bào),并提供相關(guān)證據(jù),一經(jīng)查實(shí),將立刻刪除涉嫌侵權(quán)內(nèi)容。

AI