python3 re怎么用來提取大量的數據

發布時間：2020-11-21 09:14:24　來源：億速云　閱讀：168　作者：小新　欄目：編程語言
這篇文章主要介紹了python3 re怎么用來提取大量的數據，具有一定借鑒價值，需要的朋友可以參考下。希望大家閱讀完這篇文章后大有收獲。下面讓小編帶著大家一起了解一下。
具體代碼：
re_label_script
# -*- coding:utf-8 -*-
# 自定義創(chuàng)建文件夾并保存圖片
import re
import os
from urllib.request import urlretrieve
 
# Sample page fragment used as regex input: a <script> element whose body is a
# JavaScript array of image records, each with "big", "thumb" and "original"
# URL fields.
# NOTE(review): the ";;" after each URL's closing quote looks like a copy/paste
# artifact rather than real page content -- confirm against the source page.
# It sits outside the quoted URL, so a pattern that captures up to the closing
# quote does not pick it up.
content = '''
<script>var images = [
{ "big":"http://i-2.yxdown.com/2015/3/18/KDkwMHgp/6381ccc0-ed65-4422-8671-b3158d6ad23e.jpg";;,
  "thumb":"http://i-2.yxdown.com/2015/3/18/KHgxMjAp/6381ccc0-ed65-4422-8671-b3158d6ad23e.jpg";;,
  "original":"http://i-2.yxdown.com/2015/3/18/6381ccc0-ed65-4422-8671-b3158d6ad23e.jpg";;,
  "title":"","descript":"","id":75109},
{ "big":"http://i-2.yxdown.com/2015/3/18/KDkwMHgp/fec26de9-8727-424a-b272-f2827669a320.jpg";;,
  "thumb":"http://i-2.yxdown.com/2015/3/18/KHgxMjAp/fec26de9-8727-424a-b272-f2827669a320.jpg";;,
  "original":"http://i-2.yxdown.com/2015/3/18/fec26de9-8727-424a-b272-f2827669a320.jpg";;,
  "title":"","descript":"","id":75110},
</script>
'''
 
# 自定義函數(shù)，在創(chuàng)建新的文件夾
# 固定，可直接套用
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def mkdir(path):
    """Create the directory *path* (and any missing parents) if needed.

    Surrounding whitespace and trailing backslashes are stripped from *path*
    before it is used. A status line is printed in either case.

    Args:
        path: Directory path to create.

    Returns:
        True if this call created the directory, False if something already
        exists at that path.
    """
    # Normalise the input: drop surrounding whitespace and trailing '\'.
    path = path.strip().rstrip("\\")
    # Last path component; only used in the status messages below.
    fp_new = os.path.basename(path)
    try:
        # Create first and react to failure, instead of testing
        # os.path.exists() beforehand: the directory could appear between
        # the test and the creation and crash the call.
        os.makedirs(path)
    except FileExistsError:
        print(path + '  文件夾' + fp_new + '已存在')
        return False
    print(path + '  新文件夾' + fp_new + '創(chuàng)建成功')
    return True
 
# Create a folder under the current working directory to hold the downloads.
# dir_path: absolute path of the current working directory.
dir_path = os.path.abspath(".")
# dir_new: absolute path of the new folder. os.path.join supplies the host
# OS's separator; a hard-coded '\\' only forms a sub-directory on Windows.
dir_new = os.path.join(dir_path, 'pic_down')           # name of the new folder
# Create pic_down in the current directory; mkdir() reports and skips the
# creation when the folder already exists.
mkdir(dir_new)
# 固定，可直接套用
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
# Pull every <script>...</script> body out of the page, then every "original"
# (full-size) image URL out of each script body, and download each image.
html_script = r'<script>(.*?)</script>'
res_original = r'"original":"(.*?)"'  # full-size image URL
info_script = re.findall(html_script, content, re.S|re.M)
for script in info_script:
    pic_script = re.findall(res_original, script)
    for pic in pic_script:
        print(pic)
        # Keep only the file name from the URL.
        filename = os.path.basename(pic)
        # os.path.join supplies the host OS's separator (a hard-coded '\\'
        # only works on Windows).
        target = os.path.join(dir_new, filename)
        try:
            urlretrieve(pic, target)
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError. Report the
            # failure and carry on so one dead link does not abort the batch.
            print('download failed: ' + pic + ' (' + str(err) + ')')
 
 
 
re_label_span 過濾<span></span>等標簽
# -*- coding:utf-8 -*-
# 過濾<span></span>等標(biāo)簽
import re
 
# Sample infobox table used as regex input: one <tr> per attribute, with the
# label in <th> and the value in <td> (plain text, <span>-wrapped, or links).
language = '''
<table class="infobox bordered vcard" style="width: 21em; font-size: 89%; text-align: left;" cellpadding="3">
<caption style="text-align: center; font-size: larger;"><b>周恩來</b></caption>
<tr>
<th>性別：</th>
<td>男</td>d
</tr>
<tr>
<th>異名：</th>
<td><span>(字) 翔宇</span></td>
</tr>
<tr>
<th>政黨：</th>
<td><span><a href="../articles/%E4%B8%AD9A.html" title="中國共產(chǎn)黨">中國共產(chǎn)黨</a></span></td>
</tr>
<tr>
<th>籍貫：</th>
<td><a href="../articles/%E6%B5%9981.html" title="浙江省">浙江省</a><a href="../articles/%E7%BB%8D82.html" title="紹興市">紹興市</a></td>
</tr>
</table>
'''

# Patterns are defined once up front instead of on every loop iteration.
res_tr = r"<tr>(.*?)</tr>"            # one table row
res_th = r"<th>(.*?)</th>"            # header cell (first column)
res_td = r'<td>(.*?)</td>'            # data cell (second column)
restr = r'<a href=.*?>(.*?)</a>'      # link text inside a header cell
res_a = r'<a .*?>(.*?)</a>'           # link text inside a data cell
# "\b[^>]*" makes the attributes optional, so a bare <span> matches as well as
# <span class="...">. The previous pattern '<span .*?>' required a space after
# "span" and silently dropped cells such as <span>(字) 翔宇</span>.
res_span = r'<span\b[^>]*>(.*?)</span>'

# Walk the table row by row.
info_tr = re.findall(res_tr, language, re.S|re.M)
for line in info_tr:
    # First column: the <th> label.
    info_th = re.findall(res_th, line, re.S|re.M)
    for mm in info_th:
        if "href" in mm:
            # Keep only the link text, not the URL. Fall back to the raw cell
            # when "href" is present but the link pattern does not match.
            h = re.findall(restr, mm, re.S|re.M)
            print(h[0] if h else mm)
        else:
            print(mm)

    # Second column: the <td> value.
    info_td = re.findall(res_td, line, re.S|re.M)
    for nn in info_td:
        # The order of the checks matters: a cell holding <span><a ...>
        # carries its text in the <a>, so links are tested first.
        if "href" in nn:
            # A single cell may contain several links.
            td_value = re.findall(res_a, nn, re.S|re.M)
        elif "span" in nn:
            td_value = re.findall(res_span, nn, re.S|re.M)
        else:
            td_value = [nn]
        for value in td_value:
            print(value)
 
 
 
 
 
re_label_sub img_replace br (過濾掉換行符)
# -*- coding:utf-8 -*-
# 獲取<img ../>中超鏈接及過濾<img>標(biāo)簽
import os
import re
 
# Sample table used as input: markup spread over several lines, including an
# <img> tag and a <br /> line break.
value = '''
<table style="width: 21em; text-align: left;" cellpadding="3">
<tr bgcolor="#CDDBE8">
<th colspan="2">
<center><b>中華民國政治人士</b><br /></center>
</th>
</tr>
<tr>
<th>性別：</th>
<td>男</td>
</tr>
<tr>
<th>政黨：</th>
<td><span>
<img alt="中國國民黨" src="../../../../images/Kuomintang.svg.png" width="19" height="19" border="0" />
<a href="../../../../articles/%8B%E6%B0%91%E9%BB%A8.html" title="中國國民黨">中國國民黨</a></span></td>
</tr>
</table>
'''

# Remove all markup in one pass: "<[^>]+>" matches every tag form, including
# <br />, <br> and the malformed </br>, so no per-tag special case is needed.
value = re.sub(r'<[^>]+>', '', value)
# The newlines between tags would otherwise survive as runs of blanks.
# Collapse each whitespace run to a single space and trim both ends, so the
# result is one clean line.
value = re.sub(r'\s+', ' ', value).strip()
# Result: 中華民國政治人士 性別： 男 政黨： 中國國民黨
print(value)
 
 
 
 
re_label_table
# -*- coding:utf-8 -*-
import re
 
# Sample table: each line of the row holds a label cell directly followed by
# its value cell.
s = '''<table>
<tr>
<td>序列號</td><td>DEIN3-39CD3-2093J3</td>
<td>日期</td><td>2013年1月22日</td>
<td>售價</td><td>392.70 元</td>
<td>說明</td><td>僅限5用戶使用</td>
</tr>
</table>
'''
# Two adjacent <td> cells form one (label, value) pair; findall() returns one
# tuple per pair because the pattern has two groups.
pair_pattern = re.compile(r"<td>(.*?)</td><td>(.*?)</td>", re.S|re.M)
info = pair_pattern.findall(s)
for line in info:
    label, cell = line
    print(label, cell)
# 序列號 DEIN3-39CD3-2093J3
# 日期 2013年1月22日
# 售價 392.70 元
# 說明 僅限5用戶使用
 
#     print(line[1])
# DEIN3-39CD3-2093J3
# 2013年1月22日
# 392.70 元
# 僅限5用戶使用
 
 
 
re_label_title
# -*- coding:utf-8 -*-
import re
from urllib.request import urlopen
 
# Fetch the CSDN home page as text. The context manager closes the connection
# once the body has been read. (The stray ';' characters that made the
# original call a syntax error are removed.)
with urlopen("http://www.csdn.net/") as response:
    request = response.read().decode('utf-8')

print("方法一：")           # method 1: re.search() returns the first match only
# Everything from "<a" through the title attribute, up to (not including)
# "target=".
title_pat = r"<a.*?title=.*?(?=target=)"
title_obj = re.search(title_pat, request, re.I|re.M)
if title_obj is not None:
    title = title_obj.group()
    print(title)
else:
    # re.search() returns None when nothing matches; report it rather than
    # failing with AttributeError on .group().
    title = None
    print("no match")


print("方法二：")           # method 2: re.findall() with a lookbehind
# The lookbehind keeps "<a " out of the result, so each match starts at
# "title=".
title_obj = re.findall(r"(?<=<a )title=.*?(?=target=)", request, re.I|re.M)
if title_obj:
    print(title_obj[0])
else:
    print("no match")
# title="理解情感?—?從Keras移植到pyTorch" href="http://geek.csdn.net/news/detail/239227";;
 
 
 
re_label_tr(td/th)
作者： Klaus_Lyu
# -*- coding:utf-8 -*-
import re
 
language = '''<tr><th>性別：</th><td>男</td></tr><tr><th>性別：</th><td>女</td></tr>'''
# Patterns for a table row and for the header (<th>) / data (<td>) cells
# inside it.
res_tr = r'<tr>(.*?)</tr>'
res_th = r'<th>(.*?)</th>'
res_td = r'<td>(.*?)</td>'

# Core step: capture the content between each <tr> and </tr>.
m_tr = re.findall(res_tr, language, re.S|re.M)

for line in m_tr:
    print(line)

    # The cell extraction belongs inside the row loop: run after the loop it
    # only ever sees the last row, whereas the documented output lists the
    # cells of every row directly below that row.

    # First column: <th> cells of this row.
    m_th = re.findall(res_th, line, re.S|re.M)
    for mm in m_th:
        print(mm)

    # Second column: <td> cells of this row.
    m_td = re.findall(res_td, line, re.S|re.M)
    for nn in m_td:
        print(nn)
# results:
# < th > 性別： < / th > < td > 男 < / td >
# 性別：
# 男
# < th > 性別： < / th > < td > 女 < / td >
# 性別：
# 女
 
 
 
 
 
 
re_label_head
作者： Klaus_Lyu
# -*- coding:utf-8 -*-
import re
 
# Sample <head> section used as regex input.
# NOTE(review): the ";;" after some closing quotes / tags looks like a
# copy/paste artifact -- confirm against the original page.
content = """<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>
豆瓣電影 Top 250
</title>
    <meta http-equiv="Expires" content="Sun, 6 Mar 2005 01:00:00 GMT">
    <link rel="apple-touch-icon" href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png">;;
    <script type="text/javascript">var _head_start = new Date();</script>
<link href="https://img3.doubanio.com/f/movie/dcfd6c93a0b44f2495c6ab3cdf21d8508b97bb03/css/movie/top_movies.css";; rel="stylesheet" type="text/css" />
    <style type="text/css">img { max-width: 100%; }</style>
    <script type="text/javascript"></script>
    <link rel="shortcut icon" href="https://img3.doubanio.com/favicon.ico";; type="image/x-icon">
</head>"""

# Core patterns.
# NOTE: Python's `re` requires a fixed-width lookbehind, so something like
# (?<=link.*?) is rejected; (?<=<link ) is accepted.
#
# href value of every <link> tag. "[^>]*?" keeps the search inside a single
# tag, so a <link> without href cannot pick up the href of a later element
# (the earlier ".*?" with re.S could run past the tag's closing ">").
# findall() returns only the parenthesised group.
res_link_href = r'<link\b[^>]*?\shref="([^"]*)"'
link_href = re.findall(res_link_href, content)
for line in link_href:
    print(line)
    # https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png
    # https://img3.doubanio.com/f/movie/dcfd6c93a0b44f2495c6ab3cdf21d8508b97bb03/css/movie/top_movies.css
    # https://img3.doubanio.com/favicon.ico

# Attribute text of each <meta http-equiv ...> tag, without "<meta " and ">".
link_metal = re.findall(r'(?<=<meta )http-equiv=.*?(?=>)', content)
for line in link_metal:
    print(line)
    # http-equiv="Content-Type" content="text/html; charset=utf-8"
    # http-equiv="Expires" content="Sun, 6 Mar 2005 01:00:00 GMT"

# The complete <meta http-equiv ...> tags.
link_metal = re.findall(r'<meta http-equiv=.*?>', content)
for line in link_metal:
    print(line)
    # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    # <meta http-equiv="Expires" content="Sun, 6 Mar 2005 01:00:00 GMT">
    # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    # <meta http-equiv="Expires" content="Sun, 6 Mar 2005 01:00:00 GMT">
 
 
 
re_label_href
# -*- coding:utf-8 -*-
import re
from urllib.request import urlopen
 
# !!!!!爬取豆瓣top250首頁的源代碼
# 自定義函數(shù)獲取網(wǎng)頁源代碼，自動獲取網(wǎng)站編碼格式并按相應(yīng)格式解碼賦值給request
def download(html):
    """Fetch a URL and return its body decoded to text.

    Args:
        html: URL of the page to fetch (the parameter name is kept for
            backward compatibility with existing callers).

    Returns:
        The response body as ``str``. It is decoded with the charset declared
        in the response's Content-Type header, falling back to UTF-8 when no
        charset is declared or Python does not know the declared one.

    Raises:
        UnicodeDecodeError: if the body is not valid in the chosen encoding.
        Errors raised by ``urlopen`` itself propagate unchanged.
    """
    # The context manager closes the connection once the body has been read.
    with urlopen(html) as response:
        urlorgs = response.read()
        # Use the server-declared charset rather than assuming UTF-8, so
        # pages served in e.g. GB2312 decode correctly without third-party
        # detection libraries.
        charset = response.headers.get_content_charset() or 'utf-8'
    try:
        request = urlorgs.decode(charset)
    except LookupError:
        # The declared charset name is unknown to Python; use the default.
        request = urlorgs.decode('utf-8')
    return request
 
# Fetch the Douban Top 250 landing page as decoded text. (The stray ';;'
# inside the call, which made it a syntax error, is removed.)
request = download("https://movie.douban.com/top250")

# When scraping links, study the features of the links you actually want and
# target those; the two catch-all patterns below are for illustration only.

# Every complete <a ...href=...>...</a> element that sits on a single line
# ('.' does not cross newlines because re.S is not set).
urls = re.findall(r"<a.*?href=.*?</a>", request, re.I|re.M)
for url in urls:
    print(url)

# Every href value, whether quoted with double or single quotes; the
# lookarounds keep 'href=' and the quotes themselves out of the result.
link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", request)
for url in link_list:
    print(url)
感謝你能夠認真閱讀完這篇文章，希望小編分享的「python3 re怎么用來提取大量的數據」內容對大家有幫助，同時也希望大家多多支持億速云，關注億速云行業資訊頻道，遇到問題就找億速云，詳細的解決方法等著你來學習!
向AI問一下細節
python3 re怎么用來提取大量的數據

猜你喜歡

最新資訊

相關(guān)推薦

相關(guān)標(biāo)簽