delete();
        $list = Site::query()->whereIn('status', [2, 3])->select('webmaster_domain')->limit(10)->get();
        foreach ($list as $items) {
            $url = substr(trim($items->webmaster_domain), -1);
            if ($url == '/') {
                Log::info('webmaster_domain:' . $items->webmaster_domain);
                $testUrl = substr(trim($items->webmaster_domain), 0, -1);
                $siteContent = $this->getUrlContent($testUrl);
                if (!empty($siteContent)) {
                    $urlList = $this->crawler($testUrl, $siteContent);
                    $list = [];
                    foreach ($urlList as $item) {
                        $url = substr($item, 0, strlen($testUrl));
                        if (empty(strcmp($url, $testUrl))) {
                            $list[] = $item;
                        }
                    }
                    $list = array_unique($list);
                    $list1 = $this->filter($list, $testUrl);
                    $list2 = $this->filter($list1, $testUrl);
                    Log::info('webmaster_domain1:' . count($list));
                    $this->getAllHeaderResponse($list2);
                }
            }
        }
        die('success');
    }

    /**
     * Fetch every page in $result and collect the links found on those pages
     * that belong to the site rooted at $testUrl.
     *
     * A link belongs to the site when it starts with $testUrl AND the prefix ends
     * on a URL boundary (end of string, "/", "?" or "#"). The boundary check keeps
     * "http://a.com" from also accepting "http://a.com.evil.org/..." or
     * "http://a.community/...", which a bare prefix comparison would let through.
     *
     * @param array  $result  Page URLs to fetch and scan.
     * @param string $testUrl Site base URL without a trailing slash.
     * @return array Distinct matching links (keys may be sparse after de-duplication).
     */
    private function filter($result, $testUrl)
    {
        $testUrl = (string) $testUrl;
        $prefixLength = strlen($testUrl);
        $list = [];

        foreach ($result as $item) {
            $siteContent = $this->getUrlContent($item);
            if (empty($siteContent)) {
                continue;
            }

            $pageLinks = $this->crawler($item, $siteContent);
            if (empty($pageLinks)) {
                continue;
            }

            foreach ($pageLinks as $value) {
                $value = (string) $value;
                if (strncmp($value, $testUrl, $prefixLength) !== 0) {
                    continue;
                }
                // Character right after the prefix; '' when the link equals the base URL.
                $next = (string) substr($value, $prefixLength, 1);
                if (in_array($next, ['', '/', '?', '#'], true)) {
                    $list[] = $value;
                }
            }
        }

        // De-duplicate once at the end instead of after every crawled page.
        return array_unique($list);
    }

    /**
     * Check the HTTP status of every URL in $result and store one row per URL
     * in the `site_status` table.
     *
     * Each row holds the URL, the status reported for it, and the total number
     * of URLs checked in this batch.
     *
     * @param array $result URLs to check.
     */
    private function getAllHeaderResponse($result)
    {
        // Re-index from 0 so positions line up with the indexes the request pool reports.
        $result = array_values($result);
        $count = count($result);
        if ($count === 0) {
            // Nothing to check: skip building an HTTP client and issuing an insert.
            return;
        }

        $res = self::multiCheckNetResource($result);

        $data = [];
        foreach ($result as $k => $v) {
            // Direct lookup by index; URLs without a recorded status are skipped.
            if (array_key_exists($k, $res)) {
                $data[] = [
                    'url' => $v,
                    'status' => $res[$k],
                    'count' => $count,
                ];
            }
        }

        if ($data !== []) {
            DB::table('site_status')->insert($data);
        }
    }

    /**
     * Fetch the body of a URL.
     *
     * Best effort: any error raised while fetching is swallowed and reported
     * as false, so a single unreachable page does not abort the crawl.
     *
     * @param string $url
     * @return bool|false|string Page content, or false when it could not be fetched.
     */
    private function getUrlContent($url)
    {
        try {
            return file_get_contents($url);
        } catch (\Throwable $exception) {
            return false;
        }
    }

    /**
     * Extract the links contained in a page and turn them into absolute URLs.
     *
     * Always returns an array. When no link is found the result is an empty
     * array, which is still falsy for empty()/if checks but is also safe to
     * pass straight to foreach.
     *
     * @param string $url     URL the content was fetched from.
     * @param string $content HTML content of the page.
     * @return array Absolute URLs found in $content (possibly empty).
     */
    private function crawler($url, $content = '')
    {
        $urlList = $this->reviseUrl($url, $this->filterUrl($content));

        return is_array($urlList) ? $urlList : [];
    }

    /**
     * Pull the href values of anchor tags out of an HTML document using a regex.
     *
     * Only real <a ...> tags are considered (case-insensitive); attributes may
     * be spread over several lines, and the href value may be single-quoted,
     * double-quoted or unquoted.
     *
     * @param string $webContent HTML to scan.
     * @return bool|mixed List of raw href values, or false when none were found.
     */
    private function filterUrl($webContent)
    {
        $reg = '/<a\s[^>]*?href\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';
        $result = preg_match_all($reg, (string) $webContent, $matchResult);
        if ($result) {
            return $matchResult[1];
        }

        return false;
    }

    /**
     * Convert raw href values into absolute URLs on the host of $baseUrl.
     *
     * - Absolute http(s) URLs are kept as they are.
     * - Protocol-relative URLs ("//host/path") get the scheme of $baseUrl.
     * - Empty and fragment-only hrefs, and hrefs using any other scheme
     *   (mailto:, javascript:, tel:, ...), are dropped because they are not
     *   fetchable pages.
     * - Everything else is appended to scheme://host[:port] of $baseUrl.
     *   NOTE: path-relative links ("page.html") are resolved against the site
     *   root, not against the directory of $baseUrl.
     *
     * @param string     $baseUrl URL of the page the links came from.
     * @param array|bool $urlList Raw href values, or false when there are none.
     * @return array|bool Absolute URLs, or false when $urlList is not an array
     *                    or $baseUrl has no scheme/host.
     */
    private function reviseUrl($baseUrl, $urlList)
    {
        if (!is_array($urlList)) {
            return false;
        }

        $urlInfo = parse_url($baseUrl);
        if (!is_array($urlInfo) || empty($urlInfo['scheme']) || empty($urlInfo['host'])) {
            // Cannot build absolute URLs without a scheme and a host.
            return false;
        }

        $scheme = $urlInfo['scheme'];
        $origin = $scheme . '://' . $urlInfo['host'];
        if (isset($urlInfo['port'])) {
            $origin .= ':' . $urlInfo['port'];
        }

        $result = [];
        foreach ($urlList as $urlItem) {
            $urlItem = trim((string) $urlItem);

            if ($urlItem === '' || $urlItem[0] === '#') {
                // Empty or same-page fragment link.
                continue;
            }

            if (preg_match('/^https?:\/\//i', $urlItem)) {
                // Already a complete URL.
                $result[] = $urlItem;
                continue;
            }

            if (strncmp($urlItem, '//', 2) === 0) {
                // Protocol-relative URL: reuse the scheme of the base page.
                $result[] = $scheme . ':' . $urlItem;
                continue;
            }

            if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $urlItem)) {
                // Some other scheme (mailto:, javascript:, tel:, ...).
                continue;
            }

            // Incomplete URL: attach it to the site origin.
            $result[] = $urlItem[0] === '/'
                ? $origin . $urlItem
                : $origin . '/' . $urlItem;
        }

        return $result;
    }

    /**
     * Send concurrent HEAD requests and collect one status per URL.
     *
     * @param array $taskUrls    URLs to check.
     * @param int   $concurrency Maximum number of requests in flight at once.
     * @param array $config      Options passed to the HTTP client constructor.
     * @return array Map of request index => HTTP status code for fulfilled
     *               requests, or the rejection's getCode() value for failed ones.
     */
    private static function multiCheckNetResource(
        $taskUrls,
        $concurrency = 5,
        $config = [
            'verify' => false,
            'timeout' => 3,
        ]
    ) {
        $client = new Client($config);

        // Lazily build one HEAD request per URL.
        $requests = function () use ($taskUrls) {
            foreach ($taskUrls as $item) {
                yield new \GuzzleHttp\Psr7\Request('HEAD', $item);
            }
        };

        $result = [];
        $pool = new Pool($client, $requests(), [
            // How many requests run at the same time.
            'concurrency' => $concurrency,
            // Called for every successful response.
            'fulfilled' => function (\GuzzleHttp\Psr7\Response $response, $index) use (&$result) {
                $result[$index] = $response->getStatusCode();
            },
            // Called for every failed request.
            'rejected' => function (\Throwable $throwable, $index) use (&$result) {
                $result[$index] = $throwable->getCode();
            },
        ]);

        // Block until every request has been fulfilled or rejected.
        $pool->promise()->wait();

        return $result;
    }
}