您好,登錄後才能下訂單哦!
/**
 * Single-threaded recursive web crawler.
 *
 * @author [Lee] <[<complet@163.com>]>
 *
 * Public properties:
 *   1. callcontent  callback that receives a URL and returns that page's content
 *   2. calltodo     callback that receives the page content and performs the
 *                   business logic (e.g. post-process it and store it in a database)
 *
 * Public methods:
 *   run($depth = 2)  run the crawler; $depth is the number of link levels to
 *                    follow (1 = only the start page). Returns nothing.
 */
class crawl{
    public $callcontent = 'getcontent'; # callback that fetches the content of a given URL
    public $calltodo = 'todo'; # business-logic callback, e.g. store the fetched content in a database
    private $url; # URL the crawl starts from
    private $visited = array(); # URLs already fetched during the current run(), used as a set (url => true)

    /*
    @desc: Internal helper, delegates to the content callback to fetch a page
    @param url argument handed to the callback
    @return whatever the callback returns (the page content, or false/null on failure)
    */
    private function getcontent($url){
        return call_user_func($this->callcontent, $url);
    }

    /*
    @desc: Internal helper, delegates to the business-logic callback
    @param content argument handed to the callback
    */
    private function todo($content){
        call_user_func($this->calltodo, $content);
    }

    /*
    @desc: Internal helper, extracts the href targets of all <a> tags in a page
    @param content page content
    @return urls unique href values in document order, HTML entities decoded
    */
    private function geturl($content){
        $urls = array();
        if (!is_string($content) || $content === '') {
            return $urls;
        }
        // "<a" must be a whole tag name (\b rejects <abbr>, <area>, ...), the
        // scan for href never leaves the tag ([^>]), and matching is
        // case-insensitive so <A HREF=...> is found as well.
        $preg = '/<a\b[^>]*?\shref\s*=\s*[\'"]?([^>\'"\s]*)/i';
        if (preg_match_all($preg, $content, $res)) {
            foreach ($res[1] as $link) {
                // Attribute values are HTML-encoded ("&amp;" etc.); decode them
                // so the resulting URL is the one a browser would request.
                $urls[] = html_entity_decode($link, ENT_QUOTES, 'UTF-8');
            }
        }
        return array_unique($urls);
    }

    /*
    @desc: Internal helper, rebuilds a URL as scheme://[user:pass@]host[:port]path
           (query string and fragment are dropped; a missing scheme becomes http)
    @param url original URL
    @return url rebuilt URL
    */
    private function reviseurl($url){
        $info = parse_url($url);
        if (!is_array($info)) {
            // parse_url() returns false for seriously malformed URLs.
            $info = array();
        }
        $revised = (empty($info['scheme']) ? 'http' : $info['scheme']) . '://';
        if (!empty($info['user']) && !empty($info['pass'])) {
            $revised .= $info['user'] . ':' . $info['pass'] . '@';
        }
        if (isset($info['host'])) {
            $revised .= $info['host'];
        }
        if (!empty($info['port'])) {
            $revised .= ':' . $info['port'];
        }
        if (isset($info['path'])) {
            $revised .= $info['path'];
        }
        return $revised;
    }

    /*
    @desc: Internal helper, collapses "." and ".." segments of an absolute path
    @param path path starting with "/"
    @return path the same path without dot segments
    */
    private function normalizepath($path){
        $kept = array();
        $trailing = false;
        foreach (explode('/', $path) as $segment) {
            $trailing = ($segment === '.' || $segment === '..');
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Never pop the leading empty segment that stands for the root.
                if (count($kept) > 1) {
                    array_pop($kept);
                }
                continue;
            }
            $kept[] = $segment;
        }
        if ($trailing) {
            // "/a/b/.." refers to the directory "/a/", not the file "/a".
            $kept[] = '';
        }
        $normalized = implode('/', $kept);
        return $normalized === '' ? '/' : $normalized;
    }

    /*
    @desc: Internal helper, turns a link found on a page into an absolute URL
    @param base URL of the page the link was found on
    @param link raw href value
    @return absolute http(s) URL, or null when the link must not be crawled
            (empty, fragment-only, javascript:/mailto:/... or unresolvable)
    */
    private function resolveurl($base, $link){
        $link = trim($link);
        // A fragment only addresses a position inside a page, it is never requested.
        $hash = strpos($link, '#');
        if ($hash !== false) {
            $link = substr($link, 0, $hash);
        }
        if ($link === '') {
            return null;
        }
        // The link carries its own scheme: follow http/https only.
        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $link, $m)) {
            $scheme = strtolower($m[1]);
            return ($scheme === 'http' || $scheme === 'https') ? $link : null;
        }
        // Split the cleaned-up base into "scheme://authority" and its path.
        if (!preg_match('#^(([a-z][a-z0-9+.\-]*)://[^/]+)(/.*)?$#i', $this->reviseurl($base), $m)) {
            return null;
        }
        $origin = $m[1];
        $basepath = (isset($m[3]) && $m[3] !== '') ? $m[3] : '/';
        // Protocol-relative link: "//host/path" inherits only the scheme.
        if (strncmp($link, '//', 2) === 0) {
            return $m[2] . ':' . $link;
        }
        $query = '';
        $qpos = strpos($link, '?');
        if ($qpos !== false) {
            $query = substr($link, $qpos);
            $link = substr($link, 0, $qpos);
        }
        if ($link === '') {
            // Query-only link ("?page=2") keeps the path of the current page.
            $path = $basepath;
        } elseif ($link[0] === '/') {
            // Root-relative link.
            $path = $link;
        } else {
            // Document-relative link: resolved against the directory of the base page.
            $path = substr($basepath, 0, strrpos($basepath, '/') + 1) . $link;
        }
        return $origin . $this->normalizepath($path) . $query;
    }

    /*
    @desc: Constructor, stores the start URL
    @param url URL the crawl starts from
    */
    public function __construct($url){
        $this->url = $url;
    }

    /*
    @desc: Main method, runs the crawler from the start URL
    @param depth crawl depth, default 2 (1 = only the start page)
    */
    public function run($depth = 2){
        $this->visited = array();
        $this->crawlpage($this->url, $depth);
    }

    /*
    @desc: Internal helper, processes one page and recurses into its links.
           Recursing on this instance (instead of creating new crawl objects)
           keeps customised callbacks and the visited set in effect.
    @param url page to process
    @param depth remaining depth; nothing happens when it is <= 0
    */
    private function crawlpage($url, $depth){
        if ($depth <= 0 || isset($this->visited[$url])) {
            return;
        }
        $this->visited[$url] = true;
        $content = $this->getcontent($url);
        if ($content === false || $content === null) {
            // Fetch failed: there is nothing to process and no links to follow.
            return;
        }
        // Business processing
        $this->todo($content);
        if ($depth <= 1) {
            // Children would be called with depth 0 and do nothing.
            return;
        }
        foreach ($this->geturl($content) as $link) {
            $next = $this->resolveurl($url, $link);
            if ($next !== null) {
                $this->crawlpage($next, $depth - 1);
            }
        }
    }
}
// Demo entry point: crawl the blog index with depth 1, i.e. only the start
// page itself is fetched and handed to the business callback.
// The class defined in this file is named `crawl`; `new scrawl(...)` referenced
// an undefined class and aborted with a fatal error.
$scrawl = new crawl('https://blog.51cto.com/12173069');
$scrawl->run(1);
/*
@desc: Content callback, reads and returns whatever is found at the given URL
@param url location to read
@return the content as a string, or false when file_get_contents() fails
*/
function getcontent($url){
    return file_get_contents($url);
}
/*
@desc: Business-logic callback, dumps the unique link targets found in a page
@param content page content to scan
Prints the result with var_dump(); array_unique() keeps the original match
indexes, so the printed keys may have gaps.
*/
function todo($content){
    $pattern = '/<[a|A].*?href=[\'\"]{0,1}([^>\'\"\ ]*).*?>/i';
    $matches = array();
    // Group 1 holds the href value of every match; no match means no links.
    $links = preg_match_all($pattern, $content, $matches) ? $matches[1] : array();
    var_dump(array_unique($links));
}
array(72) {
[0]=>
string(22) "https://blog.51cto.com/"
[2]=>
string(30) "https://blog.51cto.com/original"
[3]=>
string(34) "https://blog.51cto.com/cloumn/index"
[4]=>
string(28) "https://blog.51cto.com/expert"
[5]=>
string(35) "https://blog.51cto.com/blogger/index"
[6]=>
string(19) "javascript:void(0);"
[7]=>
string(20) "http://edu.51cto.com"
[8]=>
string(21) "https://blog.51cto.com"
[9]=>
string(21) "http://down.51cto.com"
[10]=>
string(21) "http://home.51cto.com"
[11]=>
string(20) "http://bbs.51cto.com"
[12]=>
string(18) "http://x.51cto.com"
[13]=>
string(0) ""
[14]=>
string(20) "http://wot.51cto.com"
[15]=>
string(20) "http://www.51cto.com"
[16]=>
string(89) "http://home.51cto.com/user/register?reback=http%253A%252F%252Fblog.51cto.com%252F12173069"
[17]=>
string(78) "https://blog.51cto.com/user/login?reback=http%3A%2F%2Fblog.51cto.com%2F12173069"
[18]=>
string(12) "javascript:;"
[19]=>
string(34) "https://blog.51cto.com/search/index"
[23]=>
string(40) "http://home.51cto.com/space?uid=12163069"
[27]=>
string(37) "https://blog.51cto.com/12173069?type=1"
[28]=>
string(37) "https://blog.51cto.com/12173069?type=2"
[29]=>
string(37) "https://blog.51cto.com/12173069?type=3"
[30]=>
string(30) "https://blog.51cto.com/12173069"
[37]=>
string(33) "https://blog.51cto.com/12173069?s="
[38]=>
string(34) "https://blog.51cto.com/12173069?s=3"
[39]=>
string(34) "https://blog.51cto.com/12173069?s=4"
[40]=>
string(34) "https://blog.51cto.com/12173069?s=5"
[41]=>
string(34) "https://blog.51cto.com/12173069?s=6"
[50]=>
string(38) "https://blog.51cto.com/12173069/2126752"
[55]=>
string(38) "https://blog.51cto.com/12173069/2126693"
[60]=>
string(38) "https://blog.51cto.com/12173069/2126661"
[65]=>
string(38) "https://blog.51cto.com/12173069/2126657"
[70]=>
string(38) "https://blog.51cto.com/12173069/2126596"
[75]=>
string(38) "https://blog.51cto.com/12173069/2126591"
[80]=>
string(38) "https://blog.51cto.com/12173069/2126496"
[85]=>
string(38) "https://blog.51cto.com/12173069/2126420"
[90]=>
string(38) "https://blog.51cto.com/12173069/2126324"
[95]=>
string(38) "https://blog.51cto.com/12173069/2126210"
[100]=>
string(38) "https://blog.51cto.com/12173069/2126090"
[105]=>
string(38) "https://blog.51cto.com/12173069/2125724"
[110]=>
string(38) "https://blog.51cto.com/12173069/2125666"
[115]=>
string(38) "https://blog.51cto.com/12173069/2125424"
[120]=>
string(38) "https://blog.51cto.com/12173069/2125359"
[125]=>
string(38) "https://blog.51cto.com/12173069/2124937"
[130]=>
string(38) "https://blog.51cto.com/12173069/2124923"
[135]=>
string(38) "https://blog.51cto.com/12173069/2124720"
[140]=>
string(38) "https://blog.51cto.com/12173069/2124693"
[145]=>
string(38) "https://blog.51cto.com/12173069/2124499"
[147]=>
string(33) "https://blog.51cto.com/12173069/p1"
[148]=>
string(33) "https://blog.51cto.com/12173069/p2"
[149]=>
string(33) "https://blog.51cto.com/12173069/p3"
[150]=>
string(33) "https://blog.51cto.com/12173069/p4"
[151]=>
string(33) "https://blog.51cto.com/12173069/p5"
[152]=>
string(33) "https://blog.51cto.com/12173069/p6"
[153]=>
string(33) "https://blog.51cto.com/12173069/p7"
[154]=>
string(33) "https://blog.51cto.com/12173069/p8"
[156]=>
string(34) "https://blog.51cto.com/12173069/p19"
[159]=>
string(39) "https://blog.51cto.com/ityouknow/2124403"
[160]=>
string(35) "https://blog.51cto.com/wyait/2125708"
[161]=>
string(39) "https://blog.51cto.com/lumay0526/2124116"
[162]=>
string(38) "https://blog.51cto.com/11010461/2123639"
[163]=>
string(35) "https://blog.51cto.com/qiuyt/2124456"
[164]=>
string(30) "https://blog.51cto.com/13716231"
[166]=>
string(30) "https://blog.51cto.com/13108471"
[168]=>
string(30) "https://blog.51cto.com/10316297"
[170]=>
string(30) "https://blog.51cto.com/13718637"
[172]=>
string(30) "https://blog.51cto.com/13681316"
[174]=>
string(20) "http://www.51CTO.com"
[175]=>
string(37) "https://blog.51cto.com/blogger/publish"
[176]=>
string(71) "http://wpa.qq.com/msgrd?v=3&uin=3591348659&site=qq&menu=yes"
[177]=>
string(39) "https://blog.51cto.com/51ctoblog/2057444"
}
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯繫站長郵箱:is@yisu.com進行舉報,並提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。