123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- <?php
- namespace App\Http\Controllers\Admin;
- use App\Http\Controllers\Controller;
- use App\Http\Models\Site;
- use GuzzleHttp\Client;
- use GuzzleHttp\Pool;
- use Illuminate\Http\Request;
- use Illuminate\Support\Facades\DB;
- use Illuminate\Support\Facades\Log;
/**
 * Crawls the pages of the monitored sites and stores the HTTP status of the
 * URLs that were discovered.
 *
 * Class ReptileController
 * @package App\Http\Controllers\Admin
 */
class ReptileController extends Controller
{
    /**
     * Crawler entry point.
     *
     * Empties the site_status table, then walks up to 10 sites whose status is
     * 2 or 3. For each site the home page is fetched, its same-site links are
     * followed two levels deep, and the HTTP status of the links found on the
     * deepest level is written to site_status.
     *
     * @param Request $request Not read by this action.
     */
    public function index(Request $request)
    {
        DB::table('site_status')->delete();

        $sites = Site::query()
            ->whereIn('status', [2, 3])
            ->select('webmaster_domain')
            ->limit(10)
            ->get();

        foreach ($sites as $site) {
            $domain = trim((string) $site->webmaster_domain);

            // Only domains stored with a trailing slash are crawled; every
            // other row is skipped, exactly as before.
            if (substr($domain, -1) !== '/') {
                continue;
            }

            Log::info('webmaster_domain:' . $site->webmaster_domain);

            // Site URL without the trailing slash; used as the same-site prefix.
            $siteUrl = substr($domain, 0, -1);

            $homePage = $this->getUrlContent($siteUrl);
            if (empty($homePage)) {
                continue;
            }

            $homeLinks = $this->sameSiteLinks($this->crawler($siteUrl, $homePage), $siteUrl);
            $secondLevelLinks = $this->filter($homeLinks, $siteUrl);
            $thirdLevelLinks = $this->filter($secondLevelLinks, $siteUrl);

            Log::info('webmaster_domain1:' . count($homeLinks));

            $this->getAllHeaderResponse($thirdLevelLinks);
        }

        die('success');
    }

    /**
     * Fetches every page in $result and collects the same-site links found on them.
     *
     * The input URLs themselves are not part of the returned list unless one
     * of the fetched pages links to them.
     *
     * @param array  $result  URLs of the pages to fetch.
     * @param string $testUrl Site URL (no trailing slash) the links must belong to.
     * @return array De-duplicated list of same-site URLs, in discovery order.
     */
    private function filter($result, $testUrl)
    {
        $found = [];

        foreach ($result as $pageUrl) {
            $html = $this->getUrlContent($pageUrl);
            if (empty($html)) {
                continue;
            }

            foreach ($this->crawler($pageUrl, $html) as $link) {
                $found[] = $link;
            }
        }

        return $this->sameSiteLinks($found, $testUrl);
    }

    /**
     * Keeps the links that belong to the given site and removes duplicates.
     *
     * @param array  $links   Absolute URLs.
     * @param string $siteUrl Site URL without trailing slash.
     * @return array Unique same-site URLs, first occurrence order preserved.
     */
    private function sameSiteLinks(array $links, $siteUrl)
    {
        $seen = [];
        $unique = [];

        foreach ($links as $link) {
            if (isset($seen[$link]) || !$this->belongsToSite($link, $siteUrl)) {
                continue;
            }
            $seen[$link] = true;
            $unique[] = $link;
        }

        return $unique;
    }

    /**
     * Tells whether $url lives under $siteUrl.
     *
     * A plain prefix comparison would also accept "http://a.com.evil.org" for
     * the site "http://a.com", so the character following the prefix must be a
     * URL boundary ("/", "?", "#") or the end of the string.
     *
     * @param string $url
     * @param string $siteUrl
     * @return bool
     */
    private function belongsToSite($url, $siteUrl)
    {
        $prefixLength = strlen($siteUrl);

        if (strncmp($url, $siteUrl, $prefixLength) !== 0) {
            return false;
        }

        if (!isset($url[$prefixLength])) {
            // The URL is exactly the site URL.
            return true;
        }

        return in_array($url[$prefixLength], ['/', '?', '#'], true);
    }

    /**
     * Requests every URL and stores one site_status row per URL that yielded a status.
     *
     * Each row holds the URL, its status code and the total number of URLs in
     * this batch.
     *
     * @param array $result URLs to check.
     */
    private function getAllHeaderResponse($result)
    {
        // Re-index so that positions line up with the indexes reported by the pool.
        $urls = array_values($result);
        if (empty($urls)) {
            return;
        }

        $total = count($urls);
        $statuses = self::multiCheckNetResource($urls);

        $rows = [];
        foreach ($urls as $index => $url) {
            if (!array_key_exists($index, $statuses)) {
                continue;
            }
            $rows[] = [
                'url' => $url,
                'status' => $statuses[$index],
                'count' => $total,
            ];
        }

        if (!empty($rows)) {
            DB::table('site_status')->insert($rows);
        }
    }

    /**
     * Downloads the body of a URL.
     *
     * @param string $url
     * @return bool|string Body on success; false when the read fails or throws.
     */
    private function getUrlContent($url)
    {
        try {
            return file_get_contents($url);
        } catch (\Throwable $exception) {
            // Best effort: an unreachable page is simply treated as "no content".
            return false;
        }
    }

    /**
     * Extracts the links of a page and turns them into absolute URLs.
     *
     * @param string $url     URL the content was downloaded from.
     * @param string $content HTML of the page.
     * @return array Absolute URLs; empty when the page has no usable link.
     */
    private function crawler($url, $content = '')
    {
        return $this->reviseUrl($url, $this->filterUrl($content));
    }

    /**
     * Collects the href value of every anchor tag in the given HTML.
     *
     * The pattern only accepts real <a> tags (not <abbr>, <address>, ...),
     * never runs past the end of the tag, is case-insensitive and tolerates
     * line breaks between attributes. HTML entities in the value (for example
     * "&amp;") are decoded.
     *
     * @param string $webContent
     * @return array Raw href values; empty when nothing matched.
     */
    private function filterUrl($webContent)
    {
        $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*["\']?([^>"\'\s]*)/i';

        if (!preg_match_all($pattern, (string) $webContent, $matches)) {
            return [];
        }

        $links = [];
        foreach ($matches[1] as $href) {
            $links[] = html_entity_decode($href, ENT_QUOTES, 'UTF-8');
        }

        return $links;
    }

    /**
     * Resolves raw href values against the URL of the page they were found on.
     *
     * Handled forms: absolute http(s) URLs, protocol-relative ("//host/x"),
     * root-relative ("/x"), query-only ("?a=1") and directory-relative ("x").
     * Empty values, pure fragments ("#top") and other schemes (mailto:,
     * javascript:, tel:, ...) are dropped. Dot segments ("../") are left as
     * they are.
     *
     * @param string $baseUrl URL of the page the links were found on.
     * @param array  $urlList Raw href values.
     * @return array Absolute URLs; empty when $baseUrl has no scheme or host.
     */
    private function reviseUrl($baseUrl, $urlList)
    {
        if (!is_array($urlList)) {
            return [];
        }

        $parts = parse_url($baseUrl);
        if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return [];
        }

        $origin = $parts['scheme'] . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        $path = isset($parts['path']) ? $parts['path'] : '/';
        if ($path === '' || $path[0] !== '/') {
            $path = '/' . $path;
        }
        // "/a/b.html" -> "/a/", "/a/" -> "/a/", "/" -> "/"
        $directory = substr($path, 0, strrpos($path, '/') + 1);

        $resolved = [];
        foreach ($urlList as $link) {
            $link = (string) $link;

            if ($link === '' || $link[0] === '#') {
                continue;
            }

            if (preg_match('/^https?:\/\//i', $link)) {
                // Already a complete URL.
                $resolved[] = $link;
            } elseif (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link)) {
                // Any other scheme cannot be fetched over HTTP.
                continue;
            } elseif (substr($link, 0, 2) === '//') {
                $resolved[] = $parts['scheme'] . ':' . $link;
            } elseif ($link[0] === '/') {
                $resolved[] = $origin . $link;
            } elseif ($link[0] === '?') {
                $resolved[] = $origin . $path . $link;
            } else {
                $resolved[] = $origin . $directory . $link;
            }
        }

        return $resolved;
    }

    /**
     * Sends concurrent HEAD requests and reports the status code of each URL.
     *
     * The returned array is keyed by the position of the URL in $taskUrls.
     * Fulfilled requests store the response status code, rejected requests
     * store the code of the rejection reason, and URLs that cannot be turned
     * into a request object store 0.
     *
     * @param array $taskUrls    URLs to request.
     * @param int   $concurrency Number of requests in flight at the same time.
     * @param array $config      Guzzle client options; the default disables
     *                           TLS certificate verification and uses a
     *                           3 second timeout.
     * @return array Status code per URL position.
     */
    private static function multiCheckNetResource(
        $taskUrls,
        $concurrency = 5,
        $config = [
            'verify' => false,
            'timeout' => 3,
        ]
    ) {
        $client = new Client($config);
        $statuses = [];

        $requests = function () use ($taskUrls, &$statuses) {
            $position = 0;
            foreach ($taskUrls as $taskUrl) {
                $index = $position++;
                try {
                    $request = new \GuzzleHttp\Psr7\Request('HEAD', $taskUrl);
                } catch (\InvalidArgumentException $exception) {
                    // One malformed URL must not abort the whole batch.
                    $statuses[$index] = 0;
                    continue;
                }
                // Explicit key so skipped URLs do not shift later indexes.
                yield $index => $request;
            }
        };

        $pool = new Pool($client, $requests(), [
            // How many requests run concurrently.
            'concurrency' => $concurrency,
            // Called for every successful response.
            'fulfilled' => function (\GuzzleHttp\Psr7\Response $response, $index) use (&$statuses) {
                $statuses[$index] = $response->getStatusCode();
            },
            // Called for every failed request.
            'rejected' => function (\Throwable $reason, $index) use (&$statuses) {
                $statuses[$index] = $reason->getCode();
            },
        ]);

        $pool->promise()->wait();

        return $statuses;
    }
}
|