Video course link: http://edu.51cto.com/course/14870.html
• A crawler, also called a web spider or web robot, automatically fetches (crawls) information from the Internet; at its core it is just a piece of code.
• Any high-level programming language can implement a crawler; it is by no means Python-only.
• The code simulates a browser sending HTTP or HTTPS requests to a server, then processes the server's response to pull out the data you want.
• Three steps: fetch the data, parse the data, store the data.
• Fetching: use the urllib module to simulate a browser sending a request.
from urllib import request

# Fetch the data
def get_data():
    url = 'https://search.51job.com/list/070200,000000,0000,00,9,99,java%25E5%25BC%2580%25E5%258F%2591,2,1.html'
    # Create a Request object with the URL and request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    # print(type(response))      # HTTPResponse
    # print(response.getcode())  # response status code
    # print(response.info())     # response headers
    if response.getcode() == 200:
        data = response.read()  # read the response body
        # print(type(data))     # bytes
        data = str(data, encoding='gbk')  # decode to str (51job serves GBK-encoded pages)
        # print(data)
        # Write the data to a file
        with open('index.html', mode='w', encoding='gbk') as f:
            f.write(data)
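The gbk encoding above is hardcoded. As a small defensive sketch (an addition, not part of the original course code), the charset can instead be read from the response headers, falling back to gbk when the server omits it:

from urllib import request

def fetch_text(url, headers, fallback='gbk'):
    # Take the charset from the Content-Type header when the server sends one
    response = request.urlopen(request.Request(url, headers=headers))
    charset = response.info().get_content_charset() or fallback
    return response.read().decode(charset)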
• Three ways to parse:
String parsing
Use plain strings together with regular expressions.
XPath
XPath is a language for finding information in XML documents; it is used to traverse the document's elements and attributes.
Use the Chrome developer tools to copy an element's XPath (a minimal lxml sketch follows this list).
The third-party module BeautifulSoup
Beautiful Soup is a Python library for extracting data from HTML and XML files.
Install: pip install beautifulsoup4
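For the XPath option above, here is a minimal sketch, assuming the third-party lxml package is installed (pip install lxml); the markup is made up for illustration, and the course code itself uses BeautifulSoup below:

from lxml import etree

html = etree.HTML('<div id="resultList"><p class="t1">Java开发</p></div>')
# Select the text of every p.t1 inside the #resultList div
titles = html.xpath('//div[@id="resultList"]/p[@class="t1"]/text()')
print(titles)  # ['Java开发']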
from bs4 import BeautifulSoup

# Parse the data
def parse_data():
    with open('index.html', mode='r', encoding='gbk') as f:
        html = f.read()
    # Create a BeautifulSoup instance to parse the HTML
    bs = BeautifulSoup(html, 'html.parser')  # use the built-in html.parser
    '''
    Ways to look up data
    '''
    # 1. find(): returns the first matching tag
    # div = bs.find('div')
    # print(div)
    # print(type(div))  # Tag
    # 2. find_all(): returns all matching tags
    # metas = bs.find_all('meta')  # returns a result set
    # print(metas[0])
    # print(bs.find_all(id='hello'))      # look up by id; returns a result set
    # print(bs.find_all(class_='itany'))  # look up by class
    # 3. select(): look up elements with CSS selectors
    # print(bs.select('#hello'))
    # print(bs.select('.itany'))
    # print(bs.select('p#world span'))
    # print(bs.select('[title]'))
    # 4. get_text(): extract the text inside a Tag
    # value = bs.select('#hello')[0].get_text(strip=True)
    # print(len(value))
    # print(value)
    # Extract the job listings
    divs = bs.select('#resultList .el')
    result = []
    for div in divs[1:]:  # skip the header row
        title = div.select('.t1')[0].get_text(strip=True)
        company = div.select('.t2')[0].get_text(strip=True)
        addr = div.select('.t3')[0].get_text(strip=True)
        salary = div.select('.t4')[0].get_text(strip=True)
        pubDate = div.select('.t5')[0].get_text(strip=True)
        # print(title, company, addr, salary, pubDate)
        row = {
            'title': title,
            'company': company,
            'addr': addr,
            'salary': salary,
            'pubDate': pubDate
        }
        result.append(row)
    return result
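The commented-out lookups above operate on the downloaded index.html. A self-contained sketch with made-up inline markup (purely illustrative) exercises the same four lookup styles:

from bs4 import BeautifulSoup

snippet = '<div id="hello" class="itany" title="greeting">Hello</div><p id="world">Nested <span>match</span></p>'
bs = BeautifulSoup(snippet, 'html.parser')
print(bs.find('div'))                               # find(): first matching tag
print(bs.find_all(class_='itany'))                  # find_all(): every tag with class="itany"
print(bs.select('p#world span'))                    # select(): CSS selector lookup
print(bs.select('#hello')[0].get_text(strip=True))  # get_text(): 'Hello'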
import pymysql

# Store the data in MySQL
def save_to_mysql(data):
    config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',
        'password': '',
        'database': 'python',
        'charset': 'utf8'
    }
    conn = pymysql.connect(**config)
    cursor = conn.cursor()
    sql = '''
        insert into t_job
        (title, company, addr, salary, pubDate)
        values
        (%(title)s, %(company)s, %(addr)s, %(salary)s, %(pubDate)s)
    '''
    cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
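A minimal driver tying the three steps together (assumed; the original does not show one). It presumes the t_job table already exists in the python database with the five columns named in the INSERT statement:

if __name__ == '__main__':
    get_data()            # step 1: fetch the page and save index.html
    jobs = parse_data()   # step 2: parse the saved page into dicts
    save_to_mysql(jobs)   # step 3: insert the rows into t_job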
• Use the openpyxl module to work with Excel files
• Install openpyxl: pip install openpyxl
• Workbook: the workbook (the file itself)
• Sheet: a worksheet
• Cell: a single cell
from openpyxl import Workbook

# Store the data in Excel
def save_to_excel(data):
    # Create the Workbook
    book = Workbook()
    # Create a Sheet at index 0
    sheet = book.create_sheet('南京Java招聘信息', 0)
    # Append rows to the sheet
    sheet.append(['职位名', '公司名', '工作地点', '薪资', '发布时间'])
    for item in data:
        row = [item['title'], item['company'], item['addr'], item['salary'], item['pubDate']]
        sheet.append(row)
    # Save to disk
    book.save('51job.xlsx')
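To verify the saved workbook, a quick read-back sketch (assuming the 51job.xlsx produced above):

from openpyxl import load_workbook

book = load_workbook('51job.xlsx')
sheet = book['南京Java招聘信息']
for row in sheet.iter_rows(values_only=True):
    print(row)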
• Install the redis library: pip install redis
import redis

# Store the data in Redis
def save_to_redis(data):
    config = {
        'host': '192.168.2.30',
        'port': 6379,
        'encoding': 'utf8'
    }
    r = redis.Redis(**config)
    # r.set('name', 'tom')
    for item in data:
        r.lpush('jobs', str(item))  # redis stores bytes/strings, not dicts, so serialize first

# Read the data back from Redis
def read_from_redis():
    config = {
        'host': '192.168.2.30',
        'port': 6379,
        'encoding': 'utf8',
        'decode_responses': True  # decode bytes to str when reading
    }
    r = redis.Redis(**config)
    print(r.lrange('jobs', 0, -1))
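Pushing str(item) stores Python's repr of the dict, which is clumsy to parse back. An alternative sketch (not from the original) serializes with JSON so the dicts survive the round trip:

import json
import redis

r = redis.Redis(host='192.168.2.30', port=6379, decode_responses=True)
r.lpush('jobs', json.dumps({'title': 'Java开发', 'salary': '1-1.5万/月'}, ensure_ascii=False))
jobs = [json.loads(s) for s in r.lrange('jobs', 0, -1)]  # back to dicts
print(jobs)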
from urllib import request
import json

# Fetch the data
def get_data():
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=400&page_start=0'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    if response.getcode() == 200:
        result = response.read()
        # print(type(result))  # bytes
        return result

# Parse the data
def parse_data(html):
    # Convert the JSON string into a dict
    data = json.loads(html)
    # print(type(data), data)
    movies = data['subjects']
    for movie in movies:
        print(movie['title'], movie['rate'])

if __name__ == '__main__':
    parse_data(get_data())
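The page_limit and page_start query parameters suggest the endpoint pages its results. A hedged sketch (assuming the API honors those parameters the way the URL implies) that walks the first three pages of 20:

from urllib import request
import json

headers = {'User-Agent': 'Mozilla/5.0'}
base = ('https://movie.douban.com/j/search_subjects'
        '?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start={}')
for start in range(0, 60, 20):  # offsets 0, 20, 40
    req = request.Request(base.format(start), headers=headers)
    data = json.loads(request.urlopen(req).read())
    for movie in data['subjects']:
        print(start, movie['title'], movie['rate'])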
• Steps: request each page of comments from the Maoyan mobile API, parse the JSON into dicts, append the rows to a text file, and walk the startTime parameter backwards until the movie's release date.
from urllib import request
import json
from datetime import datetime, timedelta
import time

# Fetch the data
def get_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    if response.getcode() == 200:
        return response.read()

# Parse the data
def parse_data(html):
    data = json.loads(html)['cmts']
    comments = []
    for item in data:
        comment = {
            'id': item['id'],
            'nickName': item['nickName'],
            'cityName': item['cityName'] if 'cityName' in item else '',  # cityName may be missing
            'content': item['content'].replace('\n', ' '),  # comments may contain newlines
            'score': item['score'],
            'startTime': item['startTime']
        }
        comments.append(comment)
    return comments

# Store the data in a text file
def save_to_txt():
    start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current time
    end_time = '2018-08-10 00:00:00'  # cutoff: the release date
    # string comparison is safe here because the timestamps are zero-padded
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=0&startTime=' + start_time.replace(' ', '%20')
        try:
            html = get_data(url)
        except Exception:
            time.sleep(1)  # back off and retry once
            html = get_data(url)
        else:
            time.sleep(0.1)  # be polite between requests
        comments = parse_data(html)
        print(comments)
        start_time = comments[14]['startTime']  # timestamp of the last comment on the page
        # step back one second so the next request does not return duplicates
        start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') - timedelta(seconds=1)
        start_time = datetime.strftime(start_time, '%Y-%m-%d %H:%M:%S')
        for item in comments:
            with open('comments.txt', mode='a', encoding='utf-8') as f:
                f.write(str(item['id']) + ',' + item['nickName'] + ',' + item['cityName'] + ',' +
                        item['content'] + ',' + str(item['score']) + ',' + item['startTime'] + '\n')

if __name__ == '__main__':
    # url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=15&startTime=2018-09-01%2011%3A10%3A00'
    # comments = parse_data(get_data(url))
    # print(comments)
    save_to_txt()
• The pyecharts charting library. (The code below uses the 0.5.x API, where Geo, Bar, and Pie are imported directly from pyecharts; in 1.x and later these classes moved to pyecharts.charts.)
from collections import Counter
from pyecharts import Geo
from pyecharts import Bar
import json

def render():
    # Collect every city name from the comments
    cities = []
    with open('comments.txt', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
        for row in rows:
            city = row.split(',')[2]
            if city != '':
                cities.append(city)
    # Reconcile the city names with the names in the coordinate file
    handle(cities)
    # Count how often each city appears
    # data = []  # e.g. [('南京', 25), ('北京', 59)]
    # for city in set(cities):
    #     data.append((city, cities.count(city)))
    data = Counter(cities).most_common()
    # Render the geo chart from the city data
    geo = Geo(
        "《一出好戏》粉丝位置分布",
        "数据来源:猫眼",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color="#404a59",
    )
    attr, value = geo.cast(data)
    geo.add(
        "",
        attr,
        value,
        visual_range=[0, 3500],
        visual_text_color="#fff",
        symbol_size=15,
        is_visualmap=True,
    )
    geo.render('粉丝位置分布.html')
    # Render a bar chart of the top 20 cities
    cities_top20 = Counter(cities).most_common(20)  # the 20 most frequent cities
    bar = Bar("《一出好戏》粉丝来源排行榜TOP20", '数据来源:猫眼', title_pos='center', width=1200, height=600)
    attr, value = bar.cast(cities_top20)
    bar.add("", attr, value)
    bar.render('粉丝来源排行榜-柱状图.html')

# Reconcile city names that cannot be found in the coordinate file
def handle(cities):
    with open(
            'C:/Users/User/PycharmProjects/python-spider/venv/Lib/site-packages/pyecharts/datasets/city_coordinates.json',
            mode='r', encoding='utf-8') as f:
        data = json.loads(f.read())  # str -> dict
    # Check each city against the coordinate data
    data_new = data.copy()  # work on a copy of the coordinate data
    for city in set(cities):
        count = 0
        for k in data:
            count += 1
            if k == city:
                break
            if k.startswith(city):  # abbreviated names, e.g. the file has 南京市 while comments say 南京
                data_new[city] = data[k]
                break
            if k.startswith(city[0:-1]) and len(city) >= 3:  # renamed divisions, e.g. 溧水县 became 溧水区
                data_new[city] = data[k]
                break
        # The city matches nothing in the file: drop its comments entirely
        if count == len(data):
            while city in cities:
                cities.remove(city)
    # print(len(data), len(data_new))
    # Overwrite the coordinate file with the patched data
    with open(
            'C:/Users/User/PycharmProjects/python-spider/venv/Lib/site-packages/pyecharts/datasets/city_coordinates.json',
            mode='w', encoding='utf-8') as f:
        f.write(json.dumps(data_new, ensure_ascii=False))  # dict -> str; ensure_ascii=False keeps the Chinese readable

if __name__ == '__main__':
    render()
from pyecharts import Pie

# Collect every rating from the comments
rates = []
with open('comments.txt', mode='r', encoding='utf-8') as f:
    rows = f.readlines()
    for row in rows:
        rates.append(row.split(',')[4])
# print(rates)
# Star buckets
attr = ['五星', '四星', '三星', '二星', '一星']
value = [
    rates.count('5') + rates.count('4.5'),
    rates.count('4') + rates.count('3.5'),
    rates.count('3') + rates.count('2.5'),
    rates.count('2') + rates.count('1.5'),
    rates.count('1') + rates.count('0.5')
]
# print(value)
pie = Pie("《一出好戏》评分星级", title_pos='center', width=900)
pie.add("", attr, value, is_label_show=True, is_legend_show=False)
pie.render('电影评分-饼图.html')
• jieba ("stutter") is a powerful word-segmentation library with excellent support for Chinese text (a quick example follows this list)
• Matplotlib is a 2D plotting library for Python that can produce plots, histograms, power spectra, bar charts, error charts, scatter plots, and more
• wordcloud is a Python word-cloud generator; it is easy to use and feature-rich
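A minimal jieba sketch (the sentence is the library's own stock example) showing the difference between full mode and the precise mode (cut_all=False) used below:

import jieba

text = '我来到北京清华大学'
print('/'.join(jieba.cut(text, cut_all=True)))   # full mode: every possible word
print('/'.join(jieba.cut(text, cut_all=False)))  # precise mode: one best segmentation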
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Collect every comment body
comments = []
with open('comments.txt', mode='r', encoding='utf-8') as f:
    rows = f.readlines()
    for row in rows:
        comment = row.split(',')[3]
        if comment != '':
            comments.append(comment)
# Segment the comments into words
comment_after_split = jieba.cut(str(comments), cut_all=False)
words = ' '.join(comment_after_split)  # join the words with spaces
# print(words)
# Configure the stopwords
stopwords = STOPWORDS.copy()
stopwords.add('电影')
stopwords.add('一出')
stopwords.add('好戏')
stopwords.add('有点')
# Load the background image that shapes the cloud
bg_image = plt.imread('love.jpg')
# Configure the word cloud
wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, stopwords=stopwords,
               max_font_size=400, random_state=50, font_path='STKAITI.TTF')
# Feed the segmented text into the cloud
wc.generate_from_text(words)
# Draw the image
plt.imshow(wc)
plt.axis('off')  # hide the axes
plt.show()       # display the image
# Save the image to a file
wc.to_file('词云图.jpg')