您好,登錄後才能下訂單哦!
<?php
namespace dollarphp;
/**
 * Concurrent web crawler built on the curl_multi API.
 *
 * @author [Lee] <[<complet@163.com>]>
 *
 * Public properties:
 *  1. calltrigger  callback invoked with (urls, depth) to crawl the links found on a page
 *  2. calltodo     callback invoked with each fetched page body (e.g. to store it in a database)
 *  3. timeout      connect timeout in seconds, default 5
 *  4. depth        maximum number of redirects to follow, default 3
 *  5. name         form field name used for uploads, default "file"
 *  6. cookie       cookie file template, default cookie.txt (per-URL file is cookie_<n>.txt)
 *
 * Public methods (all but run() return $this so calls can be chained):
 *  1. ssl       switch the requests to https
 *  2. auth      send credentials (user, pass)
 *  3. login     store received cookies in the per-URL cookie file
 *  4. cookie    send the cookies stored in the per-URL cookie file
 *  5. header    add request headers
 *  6. proxy     route requests through a proxy (url, port)
 *  7. agent     set the User-Agent string
 *  8. get       issue GET requests with optional query data
 *  9. post      issue POST requests
 *  10. json     issue POST requests with a JSON body
 *  11. upload   issue multipart uploads (file path or array of paths)
 *  12. download save the responses into a directory
 *  13. run      execute the transfers and process the results
 *
 * Call ssl() before get()/post()/json()/upload()/download(): those methods
 * build the request URL from the scheme that is current at that moment.
 */
class crawl{
    public $calltrigger = 'trigger'; # callback that crawls the links found on a page
    public $calltodo = 'todo'; # callback that receives each fetched page body
    public $timeout = 5; # connect timeout in seconds
    public $depth = 3; # maximum number of redirects to follow
    public $name = 'file'; # form field name used by upload()
    public $cookie = 'cookie.txt'; # cookie file template; "_<n>" is inserted before the extension
    private $schemes = array();
    private $hosts = array();
    private $paths = array();
    private $querys = array();
    private $options = array();
    private $chs = array();
    private $fps = array();
    private $handle;
    private $urls = array();

    /*
    @desc: internal, reads one component of a parse_url() result without notices
    @param info    parse_url() result (false for a malformed URL)
    @param key     component name
    @param default value returned when the component is absent
    */
    private function part($info,$key,$default = ''){
        return (is_array($info) && isset($info[$key])) ? $info[$key] : $default;
    }

    /*
    @desc: internal, extracts the href targets of the <a> tags in a page
    @param content page body
    @return array unique href values (keys are not re-indexed)
    */
    private function geturl($content){
        # Only real anchor tags: "<a" must be followed by whitespace, and href must
        # be a standalone attribute of that same tag.
        $preg = '/<a(?=\s)[^>]*?\shref\s*=\s*[\'"]?([^>\'"\s]*)/i';
        if(!preg_match_all($preg,(string)$content,$res)){
            return array();
        }
        return array_unique($res[1]);
    }

    /*
    @desc: internal, rebuilds a URL as scheme://[user:pass@]host[:port]path
    @param url original URL (a missing scheme defaults to http)
    @return string rebuilt URL without query string or fragment
    */
    private function reviseurl($url){
        $info = parse_url($url);
        $fixed = $this->part($info,'scheme','http').'://';
        $user = $this->part($info,'user');
        $pass = $this->part($info,'pass');
        if($user !== '' && $pass !== ''){
            $fixed .= $user.':'.$pass.'@';
        }
        $fixed .= $this->part($info,'host');
        $port = $this->part($info,'port');
        if($port !== ''){
            $fixed .= ':'.$port;
        }
        return $fixed.$this->part($info,'path');
    }

    /*
    @desc: internal, turns a link found on a page into a URL to crawl
    @param base page URL as returned by reviseurl()
    @param link raw href value
    @return string URL to crawl, or '' when the link should be skipped
    */
    private function absoluteurl($base,$link){
        # Empty links, in-page anchors and non-HTTP pseudo links cannot be fetched.
        if($link === '' || $link[0] === '#' || preg_match('/^(javascript|mailto|tel):/i',$link)){
            return '';
        }
        if(preg_match('/^https?:\/\//i',$link)){
            return $link;
        }
        # Protocol-relative link: reuse the scheme of the page.
        if(strpos($link,'//') === 0){
            return substr($base,0,strpos($base,':')).':'.$link;
        }
        # Everything else is appended to the page URL with exactly one slash.
        return rtrim($base,'/').'/'.ltrim($link,'/');
    }

    /*
    @desc: internal, resolves a callback given as a bare function name
    @param callback configured callback
    @return mixed the callback, namespace-qualified when only that form is callable
    */
    private function callback($callback){
        # A bare string such as 'todo' names a global function; fall back to a
        # function of the same name declared in this class's namespace.
        if(is_string($callback) && !is_callable($callback)){
            $local = ltrim(__NAMESPACE__.'\\'.$callback,'\\');
            if(is_callable($local)){
                return $local;
            }
        }
        return $callback;
    }

    /*
    @desc: internal, hands a fetched page body to the calltodo callback
    @param content page body
    */
    private function todo($content){
        call_user_func($this->callback($this->calltodo),$content);
    }

    /*
    @desc: internal, hands discovered URLs to the calltrigger callback
    @param urls  URLs to crawl next
    @param depth remaining depth
    */
    private function trigger($urls,$depth){
        call_user_func($this->callback($this->calltrigger),$urls,$depth);
    }

    /*
    @desc: internal, builds the request URL of one handle
    @param k     handle key
    @param extra additional, already encoded query string
    @return string scheme://host[:port]path[?query[&extra]]
    */
    private function requesturl($k,$extra = ''){
        $query = implode('&',array_filter(array((string)$this->querys[$k],(string)$extra),'strlen'));
        $url = $this->schemes[$k].'://'.$this->hosts[$k].$this->paths[$k];
        return $query === '' ? $url : $url.'?'.$query;
    }

    /*
    @desc: internal, sets the URL for GET requests
    @param data encoded query string to append
    */
    private function setget($data){
        foreach($this->chs as $k=>$v){
            $this->options[$k][CURLOPT_URL] = $this->requesturl($k,$data);
        }
        return $this;
    }

    /*
    @desc: internal, sets the URL and body for POST requests
    @param data request body (string or array)
    */
    private function setpost($data){
        foreach($this->chs as $k=>$v){
            $this->options[$k][CURLOPT_URL] = $this->requesturl($k);
            $this->options[$k][CURLOPT_POST] = 1;
            $this->options[$k][CURLOPT_POSTFIELDS] = $data;
        }
        return $this;
    }

    /*
    @desc: internal, applies the collected options to the curl handles
    */
    private function setopt(){
        foreach($this->options as $k=>$v){
            # Read here rather than in the constructor so that changes made to the
            # public properties after construction take effect.
            $v[CURLOPT_CONNECTTIMEOUT] = $this->timeout;
            $v[CURLOPT_MAXREDIRS] = $this->depth;
            curl_setopt_array($this->chs[$k],$v);
        }
        return $this;
    }

    /*
    @desc: internal, builds the per-handle cookie file name, e.g. cookie_0.txt
    @param k handle key
    */
    private function cookiefile($k){
        $info = pathinfo($this->cookie);
        $dir = (isset($info['dirname']) && $info['dirname'] !== '.') ? rtrim($info['dirname'],'/').'/' : '';
        $ext = isset($info['extension']) ? '.'.$info['extension'] : '';
        return $dir.$info['filename'].'_'.$k.$ext;
    }

    /*
    @desc: constructor, creates one curl handle per URL with the base options
    @param urls URL or array of URLs to request
    */
    public function __construct($urls){
        $urls = (array)$urls;
        $this->urls = $urls;
        $this->handle = curl_multi_init();
        foreach($urls as $k=>$v){
            $info = parse_url($v);
            $port = $this->part($info,'port');
            $this->schemes[$k] = $this->part($info,'scheme','http');
            # Keep the port with the host so it is not lost from the request URL.
            # For a scheme-less URL such as "www.example.com" parse_url() reports
            # everything as the path, which still yields a valid URL once joined.
            $this->hosts[$k] = $this->part($info,'host').($port !== '' ? ':'.$port : '');
            $this->paths[$k] = $this->part($info,'path');
            $this->querys[$k] = $this->part($info,'query');
            $this->chs[$k] = curl_init();
            $this->options[$k] = array(
                CURLOPT_RETURNTRANSFER => 1,
                CURLOPT_FOLLOWLOCATION => 1,
                CURLINFO_HEADER_OUT => true,
                CURLOPT_ENCODING => 'gzip',
            );
            curl_multi_add_handle($this->handle,$this->chs[$k]);
        }
    }

    /*
    @desc: switches the requests to https
    @param bool true: use https, false: leave the requests unchanged
    */
    public function ssl($bool = false){
        if($bool){
            foreach($this->chs as $k=>$v){
                $this->schemes[$k] = 'https';
                # libcurl no longer accepts 1 for VERIFYHOST; 2 is the supported value.
                $this->options[$k][CURLOPT_SSL_VERIFYHOST] = 2;
                # NOTE(review): peer verification stays disabled as before, which
                # accepts any certificate chain.
                $this->options[$k][CURLOPT_SSL_VERIFYPEER] = false;
            }
        }
        return $this;
    }

    /*
    @desc: sets the credentials sent with every request
    @param user user name
    @param pass password
    */
    public function auth($user,$pass){
        foreach($this->chs as $k=>$v){
            $this->options[$k][CURLOPT_USERPWD] = $user.':'.$pass;
        }
        return $this;
    }

    /*
    @desc: simulated login, stores received cookies in the per-URL cookie file
    */
    public function login(){
        foreach($this->chs as $k=>$v){
            $this->options[$k][CURLOPT_COOKIEJAR] = $this->cookiefile($k);
            # The response body is not returned to run() for login requests.
            $this->options[$k][CURLOPT_RETURNTRANSFER] = 0;
        }
        return $this;
    }

    /*
    @desc: sends the cookies stored in the per-URL cookie file
    */
    public function cookie(){
        foreach($this->chs as $k=>$v){
            $this->options[$k][CURLOPT_COOKIEFILE] = $this->cookiefile($k);
        }
        return $this;
    }

    /*
    @desc: adds request headers
    @param data array of header lines
    */
    public function header($data){
        foreach($this->chs as $k=>$v){
            $current = isset($this->options[$k][CURLOPT_HTTPHEADER]) ? $this->options[$k][CURLOPT_HTTPHEADER] : array();
            $this->options[$k][CURLOPT_HTTPHEADER] = array_merge($current,(array)$data);
        }
        return $this;
    }

    /*
    @desc: routes the requests through a proxy server
    @param url  proxy URL (a missing scheme defaults to http)
    @param port proxy port
    */
    public function proxy($url,$port){
        $info = parse_url($url);
        # A trailing slash would otherwise end up between the host and the port.
        $target = rtrim($this->part($info,'host').$this->part($info,'path'),'/');
        $purl = $this->part($info,'scheme','http').'://'.$target.':'.$port;
        foreach($this->chs as $k=>$v){
            $this->options[$k][CURLOPT_PROXY] = $purl;
        }
        return $this;
    }

    /*
    @desc: sets the User-Agent string
    @param browse User-Agent value
    */
    public function agent($browse = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'){
        foreach($this->chs as $k=>$v){
            $this->options[$k][CURLOPT_USERAGENT] = $browse;
        }
        return $this;
    }

    /*
    @desc: issues GET requests
    @param data query parameters, appended to any query already present in the URL
    */
    public function get($data = array()){
        return $this->setget(http_build_query($data));
    }

    /*
    @desc: issues POST requests
    @param data request body
    */
    public function post($data = array()){
        return $this->setpost($data);
    }

    /*
    @desc: issues POST requests with a JSON body
    @param data value to encode as JSON
    */
    public function json($data = array()){
        $data = json_encode($data);
        $this->header(array(
            'Content-Type: application/json',
            'Content-Length:' . strlen($data)
        ));
        return $this->setpost($data);
    }

    /*
    @desc: issues multipart form uploads
    @param files file path, or array of file paths
    */
    public function upload($files){
        $data = array();
        $name = $this->name;
        if(is_array($files)){
            foreach($files as $k=>$v){
                $data["{$name}[{$k}]"] = new \CURLFile($v);
            }
        }else{
            $data["{$name}"] = new \CURLFile($files);
        }
        return $this->setpost($data);
    }

    /*
    @desc: saves each response to a file named after the last segment of the URL path
    @param dir target directory (created when missing); empty means the current directory
    */
    public function download($dir = ''){
        if($dir && !is_dir($dir)){
            mkdir($dir,0755,true);
        }
        # Use the directory exactly as given so absolute paths match what mkdir() created.
        $prefix = $dir ? rtrim($dir,'/').'/' : './';
        foreach($this->paths as $k=>$v){
            $name = basename($v);
            if($name === ''){
                # The URL has no file name (e.g. "/"); fall back to a generated one.
                $name = 'download_'.$k;
            }
            $fp = fopen($prefix.$name,'w');
            if($fp === false){
                # Leave the handle in return-transfer mode instead of passing an invalid stream to curl.
                continue;
            }
            $this->fps[$k] = $fp;
            $this->options[$k][CURLOPT_FILE] = $fp;
        }
        return $this->setget('');
    }

    /*
    @desc: executes all transfers, passes each page to calltodo and the links found on it to calltrigger
    @param depth remaining crawl depth, default 2; nothing is fetched when it is not positive
    */
    public function run($depth = 2){
        $this->setopt();
        if($depth <= 0){
            return;
        }
        $depth--;
        $handle = $this->handle;
        $active = null;
        do{
            $mrc = curl_multi_exec($handle,$active);
        }while($mrc == CURLM_CALL_MULTI_PERFORM);
        while($active && $mrc == CURLM_OK){
            # curl_multi_select() returns -1 when it cannot wait on the sockets;
            # pause briefly in that case to avoid a busy loop.
            if(curl_multi_select($handle) == -1){
                usleep(100);
            }
            do{
                $mrc = curl_multi_exec($handle,$active);
            }while($mrc == CURLM_CALL_MULTI_PERFORM);
        }
        foreach($this->chs as $k=>$ch){
            if(curl_error($ch) == ""){
                $content = curl_multi_getcontent($ch);
                $this->todo($content);
                $base = $this->reviseurl($this->urls[$k]);
                # Collected per page so links of one page never leak into the next.
                $found = array();
                foreach($this->geturl($content) as $link){
                    $next = $this->absoluteurl($base,$link);
                    if($next !== ''){
                        $found[] = $next;
                    }
                }
                if(!empty($found)){
                    $this->trigger($found,$depth);
                }
            }
            curl_multi_remove_handle($handle,$ch);
            curl_close($ch);
            # Release the download target opened by download().
            if(isset($this->fps[$k]) && is_resource($this->fps[$k])){
                fclose($this->fps[$k]);
                unset($this->fps[$k]);
            }
        }
        curl_multi_close($handle);
    }
}
/**
 * Default page handler used as crawl::$calltodo.
 * Ignores the page body and prints "ok" followed by a line break.
 *
 * @param mixed $content fetched page body (unused)
 */
function todo($content){
    echo 'ok', PHP_EOL;
}
# Seed URLs the crawl starts from.
$urls = array('www.baidu.com', 'www.taobao.com');
/**
 * Default crawl entry point used as crawl::$calltrigger.
 * Builds a crawler for the given URLs, prepares plain GET requests and runs it.
 *
 * @param array $urls  URLs to fetch
 * @param int   $depth remaining depth handed to crawl::run(), default 2
 */
function trigger($urls = array(),$depth = 2){
    $crawler = new crawl($urls);
    $crawler->get();
    $crawler->run($depth);
}
# Start crawling from the seed URLs; no depth is passed, so trigger()'s default of 2 applies.
trigger($urls);
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯繫站長郵箱:is@yisu.com進行舉報,並提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。