【php】多进程瞎折腾-写个网站死链检测脚本

一个网站几万个页面,手动找死链不是找死吗?搞个脚本多开几个进程多好啊。

运行前注意了: 要在 Linux 下运行,需要有可用的 Redis 服务,PHP 需要启用 posix、pcntl、redis、curl 这些扩展,还要有 Composer。

使用

* 下载: `git clone https://github.com/wuchuhengtools/checkdeadUrl.git`
* 切换到项目目录,安装依赖: `composer install`
* 启动: `php Crawler.php`

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
<?php 
/**
* 网站死链检测程序
*
* @filename Crawler.php
* @author wuchuehng<wuchuheng@163.com>
* @date 2019/07/26
*/
declare(strict_types=1); //强类型模式
// Environment guards: the crawler forks worker processes, so it only runs on a
// non-Windows CLI build of PHP.
if(strpos(strtolower(PHP_OS), 'win') === 0) exit("not support windows, please be run on Linux\n");
if (substr(php_sapi_name(), 0, 3) !== 'cli') die("This Programe can only be run in CLI mode");
// Every extension the script calls into must be present up front; otherwise a
// worker would die later with an "undefined function" fatal (posix_getpid,
// curl_init) instead of a readable message.
foreach (['pcntl', 'posix', 'curl', 'Redis'] as $requiredExtension) {
    if (!extension_loaded($requiredExtension)) exit("Please install {$requiredExtension} extension.\n");
}
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
// Resolve the Composer autoloader relative to this file so the script can be
// started from any working directory.
require_once __DIR__ . "/vendor/autoload.php";
/**
 * Multi-process dead-link checker.
 *
 * Worker processes pop page URLs from a Redis list, download each page, probe
 * every <a href> found on it with a HEAD request, record the links that do not
 * answer 200, and push the links that do answer 200 back onto the list so they
 * get crawled in turn.
 *
 * Redis keys used (names kept as-is so existing data stays readable):
 *  - list "waitting_queue": JSON jobs {url, micotime, pid} waiting to be crawled
 *  - hash "beCrawler":      page URL => JSON {micotime, pid} for crawled pages
 *  - hash "dealUrl":        page URL => JSON list of dead links found on it
 */
class Crawler
{
    /** Redis list holding the pages that still have to be crawled. */
    private const QUEUE_KEY = 'waitting_queue';
    /** Redis hash of pages that were already crawled. */
    private const CRAWLED_KEY = 'beCrawler';
    /** Redis hash mapping a page to the dead links found on it. */
    private const DEAD_KEY = 'dealUrl';

    /** @var int Number of worker processes to fork. */
    public static $count = 10;
    /** @var string Home page of the site to check. */
    public static $domain = '<要检测的网址>';
    /** @var \Redis|null Connection owned by the current process. */
    private static $Redis;
    /** @var string Redis password; an empty string means "no AUTH". */
    private static $redis_pass = '';
    private static $redis_host = '127.0.0.1';
    private static $redis_port = 6379;

    /**
     * Return this process's Redis connection, opening it on first use.
     *
     * @return object the connected \Redis instance
     * @throws \RuntimeException when connecting or authenticating fails
     */
    private static function getRedisInstance() : object
    {
        if (!is_object(self::$Redis)) {
            $Redis = new \Redis();
            if (!$Redis->connect(self::$redis_host, self::$redis_port)) {
                throw new \RuntimeException(
                    'Cannot connect to Redis at ' . self::$redis_host . ':' . self::$redis_port
                );
            }
            // The password was configurable but never sent before.
            if (self::$redis_pass !== '' && !$Redis->auth(self::$redis_pass)) {
                throw new \RuntimeException('Redis authentication failed.');
            }
            self::$Redis = $Redis;
        }
        return self::$Redis;
    }

    /**
     * Close and forget the current connection.
     *
     * Called in the parent before every fork so that a child never inherits
     * (and shares) the parent's socket.
     */
    private static function releaseRedisInstance(): void
    {
        if (is_object(self::$Redis)) {
            self::$Redis->close();
            self::$Redis = null;
        }
    }

    /**
     * Lower-cased host of a URL without a leading "www.", or null if none.
     */
    private static function hostOf(string $url): ?string
    {
        // Let parse_url() see a host even when the scheme was omitted.
        $parts = parse_url(strpos($url, '//') === false ? 'http://' . $url : $url);
        if (!is_array($parts) || !isset($parts['host'])) {
            return null;
        }
        $host = strtolower($parts['host']);
        return strpos($host, 'www.') === 0 ? substr($host, 4) : $host;
    }

    /**
     * Whether $url belongs to the site being checked.
     *
     * Only same-site pages are queued for crawling; external links are still
     * probed for deadness but never followed, which keeps the crawl bounded.
     * If the configured domain has no recognisable host, every URL is treated
     * as same-site (the previous behaviour).
     */
    private static function isSameSite(string $url): bool
    {
        $siteHost = self::hostOf(self::$domain);
        if ($siteHost === null) {
            return true;
        }
        return self::hostOf($url) === $siteHost;
    }

    /**
     * Turn an href taken from a page into an absolute http(s) URL.
     *
     * Relative references are resolved against the configured site root (as
     * before), now with exactly one slash between the two parts.
     *
     * @param string $href raw attribute value
     * @return string|null absolute URL, or null when the href is not something
     *                     that can be probed over HTTP (empty, pure fragment,
     *                     mailto:/javascript:/tel:, or unparsable)
     */
    private static function resolveUrl(string $href): ?string
    {
        // A fragment never changes which resource is requested.
        $href = trim(explode('#', $href, 2)[0]);
        if ($href === '') {
            return null;
        }
        $parts = parse_url($href);
        if (!is_array($parts)) {
            return null;
        }
        if (isset($parts['scheme'])) {
            $scheme = strtolower($parts['scheme']);
            $isHttp = $scheme === 'http' || $scheme === 'https';
            return ($isHttp && isset($parts['host'])) ? $href : null;
        }
        if (isset($parts['host'])) {
            // Protocol-relative reference ("//host/path"): borrow the site's scheme.
            $scheme = parse_url(self::$domain, PHP_URL_SCHEME);
            if (!is_string($scheme) || $scheme === '') {
                $scheme = 'http';
            }
            return $scheme . ':' . $href;
        }
        return rtrim(self::$domain, '/') . '/' . ltrim($href, '/');
    }

    /**
     * Download a page body.
     *
     * @return string|null the body, or null when the transfer failed
     */
    private static function fetchBody(string $url): ?string
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        // Same overall limit and redirect policy as the HEAD probe, so a page
        // that was accepted through a redirect is also fetched through it and
        // a stalled server cannot hang a worker forever.
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        $body = curl_exec($ch);
        curl_close($ch);
        return is_string($body) ? $body : null;
    }

    /**
     * Probe a URL with a HEAD request, following redirects.
     *
     * @return int final HTTP status code (0 when no response was received)
     */
    private static function fetchStatusCode(string $url): int
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_NOBODY, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_exec($ch);
        $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        return $code;
    }

    /**
     * Append $link to the dead-link list stored for $page.
     */
    private static function recordDeadLink(object $Redis, string $page, string $link): void
    {
        $dead = [];
        $stored = $Redis->hGet(self::DEAD_KEY, $page);
        if (is_string($stored)) {
            // Decode as array: an earlier write may have produced a JSON
            // object, which would otherwise come back as stdClass.
            $decoded = json_decode($stored, true);
            if (is_array($decoded)) {
                $dead = $decoded;
            }
        }
        $dead[] = $link;
        // array_values() keeps the stored value a JSON list after filtering.
        $dead = array_values(array_unique(array_filter($dead)));
        $Redis->hSet(self::DEAD_KEY, $page, json_encode($dead));
    }

    /**
     * Crawl one page: probe every link on it, record the dead ones and queue
     * the live same-site ones, then mark the page as crawled.
     *
     * @param string $url absolute URL of the page to crawl
     */
    private static function checkUrl(string $url)
    {
        $current_url = $url;
        $Redis = self::getRedisInstance();
        if ($Redis->hExists(self::CRAWLED_KEY, $current_url)) {
            return;
        }

        $hrefs = [];
        $html = self::fetchBody($current_url);
        // A failed or empty download simply yields no links; the page is still
        // marked as crawled below so it is not retried endlessly.
        if ($html !== null && trim($html) !== '') {
            $crawler = new DomCrawler($html);
            $hrefs = $crawler->filterXPath('//a/@href')
                ->each(function (DomCrawler $node, $i) {
                    return $node->text();
                });
        }

        $probed = [];
        foreach ($hrefs as $href) {
            $link = self::resolveUrl((string) $href);
            // Skip non-HTTP hrefs and links already probed for this page.
            if ($link === null || isset($probed[$link])) {
                continue;
            }
            $probed[$link] = true;

            $code = self::fetchStatusCode($link);
            if ($code !== 200) {
                self::recordDeadLink($Redis, $current_url, $link);
                continue;
            }
            if (self::isSameSite($link) && !$Redis->hExists(self::CRAWLED_KEY, $link)) {
                // Queue the page for crawling.
                $Redis->lPush(
                    self::QUEUE_KEY,
                    json_encode([
                        'url' => $link,
                        'micotime' => microtime(true),
                        'pid' => posix_getpid(),
                    ])
                );
            }
        }

        $Redis->hSet(
            self::CRAWLED_KEY,
            $current_url,
            json_encode(['micotime' => microtime(true), 'pid' => posix_getpid()])
        );
    }

    /**
     * Body of a worker process: seed the crawl with the home page if nothing
     * has been crawled or queued yet, then drain the queue.
     */
    private static function runWorker(): void
    {
        cli_set_process_title('subprocess PID:' . posix_getpid());
        $Redis = self::getRedisInstance();
        if ($Redis->lLen(self::QUEUE_KEY) === 0 && !$Redis->hExists(self::CRAWLED_KEY, self::$domain)) {
            self::checkUrl(self::$domain);
        }
        // Pop first and test the result: checking lLen() and popping afterwards
        // races with the other workers and can hand back an empty pop.
        while (is_string($job = $Redis->lPop(self::QUEUE_KEY))) {
            $payload = json_decode($job, true);
            if (is_array($payload) && isset($payload['url']) && is_string($payload['url'])) {
                self::checkUrl($payload['url']);
            }
        }
    }

    /**
     * Fork one worker. Returns in the parent only; the child never returns.
     */
    private static function spawnWorker(): void
    {
        // The child must open its own Redis connection.
        self::releaseRedisInstance();
        $pid = pcntl_fork();
        if ($pid === -1) {
            exit("fork progresses error\n");
        }
        if ($pid === 0) {
            self::runWorker();
            exit(0); // keep the child from running the parent's loop
        }
    }

    /**
     * Start all workers and supervise them until the crawl is finished.
     *
     * Forks self::$count workers, then waits for every one of them. Whenever a
     * worker terminates while jobs are still queued, a replacement is forked,
     * so the number of live workers never exceeds self::$count.
     */
    public static function runAll()
    {
        $running = 0;
        for ($i = 0; $i < self::$count; $i++) {
            self::spawnWorker();
            $running++;
        }
        cli_set_process_title('main Crawler');

        // Main process: reap children until none are left.
        while ($running > 0) {
            $pid = pcntl_wait($status);
            if ($pid <= 0) {
                break; // wait failed or no children remain
            }
            $running--;
            echo "\n\n* Sub process: {$pid} exited with {$status}";

            // Replace a worker that went away while work was still pending.
            $pending = self::getRedisInstance()->lLen(self::QUEUE_KEY);
            if (is_int($pending) && $pending > 0) {
                self::spawnWorker();
                $running++;
            }
        }
        self::releaseRedisInstance();
    }
}

// Script entry point: fork the worker processes and start the crawl.
Crawler::runAll();
坚持原创技术分享,您的支持将鼓励我继续创作!
0%