溫馨提示×

您好,登錄后才能下訂單哦!

密碼登錄×
登錄注冊×
其他方式登錄
點擊 登錄注冊 即表示同意《億速云用戶服務條款》

OkHttpClient和Jsoup進行網頁爬取

發布時間:2020-07-21 11:21:38 來源:網絡 閱讀:610 作者:lifeneedyou 欄目:編程語言
通過http請求,返回一個json格式的數據,然后將json數據轉化為java對象返回給調用方。Http采用OkHttp庫,json轉化采用fastjson庫。

<!--
  Maven build descriptor for the sample crawler
  (com.ok.http.client:okhttp:0.0.1-SNAPSHOT, jar packaging).
  Declares the HTML parsing, HTTP, JSON, MongoDB and test dependencies.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.ok.http.client</groupId>
<artifactId>okhttp</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>

<name>okhttp</name>
<url>http://maven.apache.org</url>

<properties>
<!-- Source files are read as UTF-8 by the build. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
<!-- jsoup: HTML parser (org.jsoup.*) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>

<!-- BSON library (org.bson.*).
     NOTE(review): version 3.6.4 differs from mongo-java-driver 3.0.0 declared below; confirm the mix is intended. -->
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>bson</artifactId>
    <version>3.6.4</version>
</dependency>

<!-- NOTE(review): pom-typed artifact; nothing in the accompanying sources appears to use it. Confirm it is needed. -->
<dependency>
    <groupId>com.google.cloud.trace.instrumentation.jdbc</groupId>
    <artifactId>driver</artifactId>
    <version>0.1.1</version>
    <type>pom</type>
</dependency>
<!-- NOTE(review): nothing in the accompanying sources appears to use this logback artifact. Confirm it is needed. -->
<dependency>
    <groupId>ch.qos.logback.contrib</groupId>
    <artifactId>logback-mongodb-access</artifactId>
    <version>0.1.5</version>
</dependency>
<!-- MongoDB database connection driver -->
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>mongo-java-driver</artifactId>
    <version>3.0.0</version>
</dependency>
<!-- Okio I/O library; presumably declared explicitly for OkHttp (TODO confirm). -->
<dependency>
    <groupId>com.squareup.okio</groupId>
    <artifactId>okio</artifactId>
    <version>1.11.0</version>

</dependency>
<!-- OkHttp HTTP client (okhttp3.*) -->
<dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>3.6.0</version>
</dependency>

<!-- fastjson JSON library (com.alibaba.fastjson.*) -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.47</version>
</dependency>

<!-- JUnit, test scope only -->
<dependency>
  <groupId>junit</groupId>
  <artifactId>junit</artifactId>
  <version>3.8.1</version>
  <scope>test</scope>
</dependency>

</dependencies>
</project>

package com.ok.http.client.okhttp;

import java.util.List;
import java.util.Map;

/**
 * Command-line entry point of the crawler: downloads the ranking page,
 * parses it into one map per listed site, and stores every map.
 */
public class ExecuteTask {

    /**
     * Runs the download, parse and store steps in sequence.
     *
     * @param args unused
     * @throws IllegalStateException if the ranking page could not be downloaded
     * @throws Exception             if parsing the downloaded page fails
     */
    public static void main(String[] args) throws Exception {
        String url = "http://top.chinaz.com/all/index.html";
        System.out.println("開(kāi)始爬取,請(qǐng)等待.");

        // downloadHtml is static, so no CrawlData instance is required.
        String htmlBody = CrawlData.downloadHtml(url);
        if (htmlBody == null) {
            // downloadHtml signals failure by returning null. Stop here rather
            // than report success and hand a null page to the parser.
            throw new IllegalStateException("Failed to download " + url);
        }
        System.out.println("爬取成功");

        // Turn the downloaded page into one map per listed site.
        List<Map<String, Object>> dataList = Analysis.analysisData(htmlBody);
        System.out.println("數(shù)據(jù)解析成功");

        for (Map<String, Object> data : dataList) {
            StoreData.adds(data);
            System.out.println("存儲(chǔ)成功");
        }
    }
}

package com.ok.http.client.okhttp;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

/**
 * Downloads web pages over HTTP with OkHttp.
 */
public class CrawlData
{
    /**
     * Single client shared by every download. Building a new OkHttpClient per
     * request gives each request its own connection pool and dispatcher, which
     * is wasteful when many pages are fetched in a row.
     */
    private static final OkHttpClient CLIENT = new OkHttpClient();

    /**
     * Issues a GET request for {@code url} and returns the response body as text.
     *
     * @param url address of the page to download
     * @return the response body, or {@code null} if executing the request or
     *         reading the body failed (the exception is printed to stderr)
     */
    public static String downloadHtml(String url) {
        String body = null;
        // Build the request.
        Request request = new Request.Builder().url(url).build();
        // Execute it synchronously.
        try {
            Response response = CLIENT.newCall(request).execute();
            // string() decodes using the charset announced in the Content-Type
            // header (UTF-8 when none is given) instead of the platform default
            // charset, and releases the body once it has been read.
            body = response.body().string();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return body;
    }
}

    package com.ok.http.client.okhttp;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.*;

/**
 * Extracts site records from the downloaded ranking page.
 */
public class Analysis {

    /** Site root that is prepended to the link found in each list row. */
    private static final String SITE_ROOT = "http://top.chinaz.com";

    /**
     * Parses the ranking page and builds one map per {@code li} found under
     * {@code ul.listCentent}.
     *
     * <p>For every row the linked detail page is downloaded as well, in order
     * to read the filing information shown there.
     *
     * @param htmlBody HTML of the ranking page
     * @return one map per row, holding the extracted text fields
     * @throws Exception declared for callers; parsing problems surface as
     *                   runtime exceptions from jsoup
     */
    public static List<Map<String, Object>> analysisData(String htmlBody) throws Exception {
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
        Document doc = Jsoup.parse(htmlBody);
        Elements elements = doc.select("ul.listCentent").select("li");
        System.out.println(" 數(shù)據(jù)集合大小=====" + elements.size());
        for (Element element : elements) {
            Map<String, Object> record = new HashMap<String, Object>();
            // Company name
            String siteName = element.select("div.CentTxt > h4.rightTxtHead > a").text();
            System.out.println("siteName=====" + siteName);
            // Domain name
            String domainName = element.select("div.CentTxt > h4.rightTxtHead > span").text();
            System.out.println("domainName=====" + domainName);
            // Alexa rank
            String alexaRank = element.select("li.clearfix >div.CentTxt > div.RtCPart >p").text();
            System.out.println("AlexaRank=====" + alexaRank);
            // Company synopsis
            String synopsis = element.select("div.CentTxt> p").text();
            System.out.println("公司簡(jiǎn)介====" + synopsis);
            // Score
            String score = element.select("div.RtCRateCent>span").text();
            System.out.println(score);
            // Rank
            String siteRank = element.select("div.RtCRateCent> strong").text();
            System.out.println("排名:" + siteRank);
            // Detail page URL. A row without any link must not abort the whole
            // run, so the URL and the filing information stay null in that case.
            Element firstLink = element.select("a").first();
            String webSite = null;
            if (firstLink != null) {
                webSite = SITE_ROOT + firstLink.attr("href");
            }
            System.out.println("網(wǎng)址:" + webSite);
            // Filing information, read from the detail page
            String recordInformation = null;
            if (webSite != null) {
                recordInformation = getRecordInformation(webSite);
            }
            System.out.println("備案信息" + recordInformation);
            System.out.println("\t");

            record.put("siteName", siteName);
            record.put("domainName", domainName);
            record.put("AlexaRank", alexaRank);
            record.put("公司簡(jiǎn)介", synopsis);
            // The score was extracted and printed but never stored before.
            record.put("score", score);
            record.put("排名", siteRank);
            record.put("網(wǎng)址", webSite);
            record.put("備案信息", recordInformation);
            list.add(record);
        }
        return list;
    }

    /**
     * Downloads a detail page and reads the filing information from it.
     *
     * @param url address of the detail page
     * @return text of the elements matching {@code li.TMain06List-Left>p},
     *         or {@code null} if the page could not be downloaded
     */
    private static String getRecordInformation(String url) {
        String htmlBody = CrawlData.downloadHtml(url);
        if (htmlBody == null) {
            return null;
        }
        Document doc = Jsoup.parse(htmlBody);
        return doc.select("li.TMain06List-Left>p").text();
    }
}

    package com.ok.http.client.okhttp;
    import com.alibaba.fastjson.JSONObject;
    import com.mongodb.MongoClient;
    import com.mongodb.client.MongoCollection;
    import com.mongodb.client.MongoDatabase;
    import org.bson.Document;
    import java.util.Map;

/**
 * Stores crawled site records in MongoDB.
 */
public class StoreData{

    /**
     * Inserts one record into the {@code information} collection of the
     * {@code sit_rank} database on {@code localhost:27017}.
     *
     * <p>All entries of {@code dataMap} become fields of the inserted document.
     * The value stored under {@code "siteName"}, when present and non-empty,
     * is also used as the document {@code _id}; otherwise {@code _id} is left
     * unset. Any failure is reported on stderr and not rethrown.
     *
     * @param dataMap field names and values of the record to store
     */
    public static void adds(Map<String,Object> dataMap){
        MongoClient mongoClient = null;
        try{
            // Connect to the MongoDB service.
            mongoClient = new MongoClient( "localhost" , 27017 );
            // Select the database.
            MongoDatabase mongoDatabase = mongoClient.getDatabase("sit_rank");
            System.out.println(mongoDatabase);
            System.out.println("成功連接數(shù)據(jù)庫(kù)");

            MongoCollection<Document> collection = mongoDatabase.getCollection("information");
            System.out.println(collection);
            System.out.println("集合 information 選擇成功");

            // Build the document straight from the map. Map.toString() is not
            // JSON, so it must not be round-tripped through a JSON parser.
            Document document = new Document(dataMap);

            // Use the site name as primary key when there is one. A null or
            // empty key would be shared by several records and make every
            // insert after the first fail as a duplicate.
            Object siteName = dataMap.get("siteName");
            if (siteName != null && !"".equals(siteName)) {
                document.put("_id", siteName);
            }

            collection.insertOne(document);
            System.out.println("文檔插入成功");
        }catch(Exception e){
            System.err.println( e.getClass().getName() + ": " + e.getMessage() );
        }finally{
            // Close the connection on the failure path too.
            if (mongoClient != null) {
                mongoClient.close();
                System.out.println("MongoDB連接已關(guān)閉");
            }
        }
    }

}

向AI問一下細節

免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。

AI