您好,登錄后才能下訂單哦!
本篇內容介紹了“Python爬蟲入門案例之實現爬取二手房源數據”的有關知識,在實際案例的操作過程中,不少人都會遇到這樣的困境,接下來就讓小編帶領大家學習一下如何處理這些情況吧!希望大家仔細閱讀,能夠學有所成!
系統分析網頁性質
結構化的數據解析
csv數據保存
python 3.8
pycharm 專業版 >>> 激活碼
#模塊使用
requests >>> pip install requests
parsel >>> pip install parsel
csv
【付費VIP完整版】只要看了就能學會的教程,80集Python基礎入門視頻教學
點這里即可免費在線觀看
爬蟲代碼實現步驟: 發送請求 >>> 獲取數據 >>> 解析數據 >>> 保存數據
import csv
import re

import parsel  # HTML parsing module (third-party): pip install parsel
import requests  # HTTP request module (third-party): pip install requests
# Scraper script: request a listing page, follow each listing's detail link,
# extract the fields below, and append one CSV row per listing.

# Step 1 -- request the first listing page. A browser-like User-Agent header
# is sent so the request looks like it comes from an ordinary browser.
url = 'https://bj.lianjia.com/ershoufang/pg1/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
# requests has no default timeout; bound the wait so a stalled server
# cannot hang the script indefinitely.
response = requests.get(url=url, headers=headers, timeout=10)
print(response.text)

# Step 2 -- turn the page text into a Selector and collect the detail-page
# links of every listing on it.
selector_1 = parsel.Selector(response.text)
href = selector_1.css('div.leftContent li div.title a::attr(href)').getall()

# Step 3 -- open the CSV *before* the loop that writes to it (the writer used
# to be created only after the loop, which raised NameError), and let the
# `with` block close the file even if a request or parse step fails.
with open('二手房數(shù)據(jù).csv', mode='a', encoding='utf-8', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=[
        '標(biāo)題',
        '市區(qū)',
        '小區(qū)',
        '戶型',
        '朝向',
        '樓層',
        '裝修情況',
        '電梯',
        '面積(㎡)',
        '價(jià)格(萬(wàn)元)',
        '年份',
    ])
    # The file is opened in append mode: only emit the header when the file
    # is still empty, so re-running the script does not insert extra header
    # rows in the middle of the data.
    if f.tell() == 0:
        csv_writer.writeheader()

    for link in href:
        html_data = requests.get(url=link, headers=headers, timeout=10).text
        selector = parsel.Selector(html_data)

        # `.get()` yields None when a selector matches nothing; fields that
        # are post-processed with str methods use default='' so an
        # unexpected page layout produces an empty cell instead of an
        # AttributeError that aborts the whole run.
        title = selector.css('.title h2::text').get()  # title
        area = selector.css('.areaName .info a:nth-child(1)::text').get()  # district
        community_name = selector.css('.communityName .info::text').get()  # community
        room = selector.css('.room .mainInfo::text').get()  # layout
        room_type = selector.css('.type .mainInfo::text').get()  # orientation

        # Floor text is split on '/' and the last part is kept; the regex
        # then pulls the number that follows '共' out of that part.
        height = selector.css('.room .subInfo::text').get(default='').split('/')[-1]
        # Raw string avoids the invalid-escape warning for `\d`. Both the
        # Simplified and Traditional form of the trailing character are
        # accepted; when nothing matches, the raw text is kept rather than
        # raising IndexError.
        floors = re.findall(r'共(\d+)[层層]', height)
        if floors:
            height = floors[0]

        sub_info = selector.css('.type .subInfo::text').get(default='').split('/')[-1]  # decoration
        # Elevator is stored exactly as scraped (may be None); the original
        # snippet carried a disabled normalization for missing values.
        Elevator = selector.css('.content li:nth-child(12)::text').get()  # elevator
        house_area = selector.css('.content li:nth-child(3)::text').get(default='').replace('㎡', '')  # floor area
        price = selector.css('.price .total::text').get()  # price
        date = selector.css('.area .subInfo::text').get(default='').replace('年建', '')  # build year

        dit = {
            '標(biāo)題': title,
            '市區(qū)': area,
            '小區(qū)': community_name,
            '戶型': room,
            '朝向': room_type,
            '樓層': height,
            '裝修情況': sub_info,
            '電梯': Elevator,
            '面積(㎡)': house_area,
            '價(jià)格(萬(wàn)元)': price,
            '年份': date,
        }
        csv_writer.writerow(dit)
        print(title, area, community_name, room, room_type, height, sub_info, Elevator, house_area, price, date, sep='|')
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.charts import Grid
from pyecharts.charts import Line
from pyecharts.charts import Map
from pyecharts.charts import Pie
from pyecharts.charts import Scatter

# Load the listing data into a DataFrame and preview the first rows.
# (`df.head()` only displays something when run as a notebook cell.)
df = pd.read_csv('鏈家.csv', encoding='utf-8')
df.head()
# Map chart: number of listings per district of Beijing.
# `region` and `count` are defined outside this snippet -- presumably
# parallel sequences of district names and listing counts; verify.
#
# The suffix appended to each name was mis-encoded ('區(qū)'), so no value
# could match a region of the '北京' map. NOTE(review): the map is assumed to
# key regions by Simplified-Chinese district names ending in '区' -- confirm
# against the map data in use.
new = [x + '区' for x in region]
m = (
    Map()
    .add('', [list(z) for z in zip(new, count)], '北京')
    .set_global_opts(
        title_opts=opts.TitleOpts(title='北京市二手房各區(qū)分布'),
        visualmap_opts=opts.VisualMapOpts(max_=3000),
    )
)
m.render_notebook()
# Combined chart: bar series of `count` per `region` with a line series of
# rounded `df_price` values plotted against a second y-axis.
# `region`, `count` and `df_price` are defined outside this snippet.

# Bare expression carried over from the notebook; its result is discarded.
df_price.values.tolist()
price = [round(x, 2) for x in df_price.values.tolist()]

bar = Bar()
bar.add_xaxis(region)
bar.add_yaxis('數(shù)量', count, label_opts=opts.LabelOpts(is_show=True))
# Second y-axis (index 1) used by the line series below.
bar.extend_axis(
    yaxis=opts.AxisOpts(
        name="價(jià)格(萬(wàn)元)",
        type_="value",
        min_=200,
        max_=900,
        interval=100,
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
    )
)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='各城區(qū)二手房數(shù)量-平均價(jià)格柱狀圖'),
    tooltip_opts=opts.TooltipOpts(
        is_show=True, trigger="axis", axis_pointer_type="cross"
    ),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
    ),
    yaxis_opts=opts.AxisOpts(
        name='數(shù)量',
        axistick_opts=opts.AxisTickOpts(is_show=True),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
)

line2 = Line()
line2.add_xaxis(xaxis_data=region)
line2.add_yaxis(
    series_name="價(jià)格",
    yaxis_index=1,
    y_axis=price,
    label_opts=opts.LabelOpts(is_show=True),
    z=10,
)

# Draw the line on top of the bar chart, then place the result in a grid.
bar.overlap(line2)
grid = Grid()
grid.add(bar, opts.GridOpts(pos_left="5%", pos_right="20%"), is_control_axis_index=True)
grid.render_notebook()
# Bar chart built from `top_price` (defined outside this snippet): one bar
# per value of its community column, bar height taken from its price column.
area0 = top_price['小區(qū)'].values.tolist()
# NOTE: despite the name, `count` holds the price column values here.
count = top_price['價(jià)格(萬(wàn)元)'].values.tolist()

bar = Bar()
bar.add_xaxis(area0)
bar.add_yaxis('數(shù)量', count, category_gap='50%')
bar.set_global_opts(
    yaxis_opts=opts.AxisOpts(name='價(jià)格(萬(wàn)元)'),
    xaxis_opts=opts.AxisOpts(name='數(shù)量'),
)
bar.render_notebook()
# Scatter plot of the price column against the floor-area column of `df`.
s = Scatter()
s.add_xaxis(df['面積(㎡)'].values.tolist())
s.add_yaxis('', df['價(jià)格(萬(wàn)元)'].values.tolist())
# The x-axis is set to a numeric ("value") axis.
s.set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
s.render_notebook()
# Donut chart of `df_direction` (defined outside this snippet): its index
# supplies the slice names and its values the slice sizes.
directions = df_direction.index.tolist()
count = df_direction.values.tolist()
c1 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count)],
        radius=['20%', '60%'],
        center=['40%', '50%'],
        # rosetype is left unset here (it was disabled in the original).
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='房屋朝向占比', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", pos_top="25%", orient="vertical"),
    )
    # The label template had its placeholders corrupted (the name
    # placeholder was dropped and the percentage placeholder replaced by
    # junk text). Restored to name:value (percent%) using the ECharts pie
    # placeholders {b}, {c} and {d}. `position` belongs to the label
    # options, so it is passed to LabelOpts instead of set_series_opts.
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter='{b}:{c} ({d}%)', position="outside")
    )
)
c1.render_notebook()
# Combined chart: a horizontal bar chart of `df_fitment` with a small rose
# pie overlapped onto it. `df_fitment` and `df_direction` are defined outside
# this snippet.
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()
# NOTE(review): the pie is titled as an elevator chart but is fed from
# `df_direction` -- confirm which data set was intended.
directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()
bar = (
    Bar()
    .add_xaxis(fitment)
    .add_yaxis('', count1, category_gap='50%')
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(name='數(shù)量'),
        title_opts=opts.TitleOpts(title='裝修情況/有無(wú)電梯玫瑰圖(組合圖)', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%", pos_top="58%", orient="vertical"),
    )
)
c2 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count2)],
        radius=['10%', '30%'],
        center=['75%', '65%'],
        rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='有/無(wú)電梯', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%", pos_top="15%", orient="vertical"),
    )
    # The label template had its placeholders corrupted (name placeholder
    # dropped, percentage placeholder replaced by junk text). Restored to
    # name:value / (percent%) using the ECharts pie placeholders {b}, {c}
    # and {d}. `position` is a label option, so it goes into LabelOpts.
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter='{b}:{c} \n ({d}%)', position="outside")
    )
)
bar.overlap(c2)
bar.render_notebook()
# Bar chart of `df_floor` (defined outside this snippet) with a slider-type
# data-zoom control: index values on the x-axis, values as bar heights.
floor = df_floor.index.tolist()
count = df_floor.values.tolist()

bar = Bar()
bar.add_xaxis(floor)
bar.add_yaxis('數(shù)量', count)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='二手房樓層分布柱狀縮放圖'),
    yaxis_opts=opts.AxisOpts(name='數(shù)量'),
    xaxis_opts=opts.AxisOpts(name='樓層'),
    datazoom_opts=opts.DataZoomOpts(type_='slider'),
)
bar.render_notebook()
# Bar chart of `df_area` (defined outside this snippet) with the axes
# reversed and the value labels placed to the right of each bar.
area = df_area.index.tolist()
count = df_area.values.tolist()

bar = Bar()
bar.add_xaxis(area)
bar.add_yaxis('數(shù)量', count)
bar.reversal_axis()
bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='房屋面積分布縱向柱狀圖'),
    yaxis_opts=opts.AxisOpts(name='面積(㎡)'),
    xaxis_opts=opts.AxisOpts(name='數(shù)量'),
)
bar.render_notebook()
“Python爬蟲入門案例之實現爬取二手房源數據”的內容就介紹到這里了,感謝大家的閱讀。如果想了解更多行業相關的知識可以關注億速云網站,小編將為大家輸出更多高質量的實用文章!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。