<?php

namespace App\Http\Controllers\Admin;

use App\Http\Controllers\Controller;
use App\Http\Models\Site;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Log;

/**
 * Crawls registered sites and records the HTTP status of the URLs found on them.
 *
 * Class ReptileController
 * @package App\Http\Controllers\Admin
 */
class ReptileController extends Controller
{
    /**
     * Crawler entry point.
     *
     * Empties the `site_status` table, then for up to 10 sites whose `status`
     * is 2 or 3 follows same-site links three levels deep and stores the HTTP
     * status of every URL found on the last level.
     *
     * @param Request $request Unused; kept for the route signature.
     * @return \Illuminate\Http\Response Plain "success" body.
     */
    public function index(Request $request)
    {
        // Every run replaces the previous report.
        DB::table('site_status')->delete();

        $sites = Site::query()
            ->whereIn('status', [2, 3])
            ->select('webmaster_domain')
            ->limit(10)
            ->get();

        foreach ($sites as $site) {
            $domain = trim((string) $site->webmaster_domain);

            // NOTE(review): domains stored without a trailing slash are skipped
            // entirely, exactly as before. Confirm this is intended.
            if (substr($domain, -1) !== '/') {
                continue;
            }

            Log::info('webmaster_domain:' . $site->webmaster_domain);

            // Site root without the trailing slash, e.g. "http://example.com".
            $siteRoot = (string) substr($domain, 0, -1);
            if ($siteRoot === '') {
                continue;
            }

            $homepage = $this->getUrlContent($siteRoot);
            if (empty($homepage)) {
                continue;
            }

            // crawler() returns false when the page has no usable links;
            // normalise to an empty list so the loops below never see a bool.
            $links = $this->crawler($siteRoot, $homepage) ?: [];

            $firstLevel = $this->sameSiteUrls($links, $siteRoot);
            $secondLevel = $this->filter($firstLevel, $siteRoot);
            $thirdLevel = $this->filter($secondLevel, $siteRoot);

            Log::info('webmaster_domain1:' . count($firstLevel));

            // NOTE(review): only the third-level links are status-checked; URLs
            // that appear only on the first two levels are not recorded.
            $this->getAllHeaderResponse($thirdLevel);
        }

        // Return a response instead of die() so the framework can finish the
        // request lifecycle normally.
        return response('success');
    }

    /**
     * Fetches every page in $result and collects the same-site links found on them.
     *
     * @param array  $result  Page URLs to fetch.
     * @param string $testUrl Site root (no trailing slash) that links must belong to.
     * @return array Unique same-site URLs, in order of first appearance.
     */
    private function filter(array $result, $testUrl)
    {
        $found = [];

        foreach ($result as $pageUrl) {
            $html = $this->getUrlContent($pageUrl);
            if (empty($html)) {
                continue;
            }

            $links = $this->crawler($pageUrl, $html);
            if (empty($links)) {
                continue;
            }

            foreach ($this->sameSiteUrls($links, $testUrl) as $link) {
                $found[] = $link;
            }
        }

        // De-duplicate once at the end instead of on every iteration.
        return array_values(array_unique($found));
    }

    /**
     * Keeps only the URLs that belong to the given site root.
     *
     * A URL belongs to the site when it starts with $siteRoot and the prefix is
     * followed by the end of the string, "/", "?" or "#". The boundary check
     * stops "http://a.com" from also accepting "http://a.com.evil.net/".
     *
     * @param array  $urls     Absolute URLs.
     * @param string $siteRoot Site root without trailing slash.
     * @return array Unique matching URLs, in order of first appearance.
     */
    private function sameSiteUrls(array $urls, $siteRoot)
    {
        $prefixLength = strlen($siteRoot);
        $matches = [];

        foreach ($urls as $url) {
            if (strncmp($url, $siteRoot, $prefixLength) !== 0) {
                continue;
            }

            $next = substr($url, $prefixLength, 1);
            if ($next === '' || $next === false || strpos('/?#', $next) !== false) {
                $matches[] = $url;
            }
        }

        return array_values(array_unique($matches));
    }

    /**
     * Checks the HTTP status of every URL and stores the results in `site_status`.
     *
     * Each row holds the URL, its status code and the total number of URLs
     * checked in this batch.
     *
     * @param array $result URLs to check.
     */
    private function getAllHeaderResponse(array $result)
    {
        // Reset the keys so they line up with the indexes reported by the pool.
        $urls = array_values($result);
        $total = count($urls);
        if ($total === 0) {
            return;
        }

        $statuses = self::multiCheckNetResource($urls);

        $rows = [];
        foreach ($urls as $index => $url) {
            if (!array_key_exists($index, $statuses)) {
                continue;
            }

            $rows[] = [
                'url' => $url,
                'status' => $statuses[$index],
                'count' => $total,
            ];
        }

        if ($rows !== []) {
            DB::table('site_status')->insert($rows);
        }
    }

    /**
     * Downloads the body of a URL.
     *
     * Best effort: any error raised while fetching is swallowed and reported
     * as false so that one unreachable page does not stop the crawl.
     *
     * @param string $url
     * @return false|string Page body, or false when it could not be fetched.
     */
    private function getUrlContent($url)
    {
        try {
            return file_get_contents($url);
        } catch (\Throwable $exception) {
            return false;
        }
    }

    /**
     * Extracts the links of a page as absolute URLs.
     *
     * @param string $url     URL the content was fetched from.
     * @param string $content Page HTML.
     * @return array|false Absolute URLs, or false when none were found.
     */
    private function crawler($url, $content = '')
    {
        $urlList = $this->reviseUrl($url, $this->filterUrl($content));

        return !empty($urlList) ? $urlList : false;
    }

    /**
     * Extracts the raw href values of all anchor tags in the given HTML.
     *
     * @param string $webContent
     * @return array|false List of href values, or false when nothing matched.
     */
    private function filterUrl($webContent)
    {
        // Matches <a ... href=VALUE ...> case-insensitively. Staying inside the
        // tag ([^>]) also covers anchors whose attributes span several lines
        // and avoids matching other tags that merely start with "a".
        $reg = '/<a\s+(?:[^>]*?\s)?href\s*=\s*[\'"]?([^>\'"\s]*)/i';

        $matched = preg_match_all($reg, (string) $webContent, $matchResult);

        return $matched ? $matchResult[1] : false;
    }

    /**
     * Turns raw href values into absolute URLs.
     *
     * - absolute http(s) URLs are kept as they are;
     * - protocol-relative URLs ("//host/path") get the scheme of $baseUrl;
     * - same-document references ("" and "#...") and non-http schemes
     *   (javascript:, mailto:, tel:, ...) are dropped because they cannot be fetched;
     * - everything else is appended to scheme://host[:port] of $baseUrl.
     *
     * NOTE(review): relative paths are resolved against the site root, not
     * against the directory of the current page, as before.
     *
     * @param string      $baseUrl URL of the page the links were found on.
     * @param array|false $urlList Raw href values.
     * @return array|false Absolute URLs, or false when the input is unusable.
     */
    private function reviseUrl($baseUrl, $urlList)
    {
        if (!is_array($urlList)) {
            return false;
        }

        $urlInfo = parse_url($baseUrl);
        if (!is_array($urlInfo) || empty($urlInfo['scheme']) || empty($urlInfo['host'])) {
            return false;
        }

        $scheme = $urlInfo['scheme'];
        $origin = $scheme . '://' . $urlInfo['host'];
        if (isset($urlInfo['port'])) {
            $origin .= ':' . $urlInfo['port'];
        }

        $result = [];
        foreach ($urlList as $urlItem) {
            if ($urlItem === '' || $urlItem[0] === '#') {
                continue;
            }

            if (preg_match('/^https?:\/\//i', $urlItem)) {
                // Already a complete URL.
                $result[] = $urlItem;
                continue;
            }

            if (strncmp($urlItem, '//', 2) === 0) {
                // Protocol-relative URL: reuse the scheme of the current page.
                $result[] = $scheme . ':' . $urlItem;
                continue;
            }

            if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $urlItem)) {
                // Some other scheme (javascript:, mailto:, ...): not crawlable.
                continue;
            }

            // Incomplete URL: attach it to the origin.
            if ($urlItem[0] === '/') {
                $result[] = $origin . $urlItem;
            } else {
                $result[] = $origin . '/' . $urlItem;
            }
        }

        return $result;
    }

    /**
     * Sends concurrent HEAD requests and reports the status code of each URL.
     *
     * The returned array is keyed like $taskUrls. Successful responses store
     * their status code; failures store the code of the rejection reason
     * (presumably the HTTP status for error responses and 0 for transport
     * errors; verify against the Guzzle version in use). URLs that cannot even
     * be turned into a request are recorded as 0.
     *
     * @param array $taskUrls    URLs to check.
     * @param int   $concurrency Number of requests in flight at the same time.
     * @param array $config      Guzzle client options. The default disables TLS
     *                           certificate verification and uses a 3s timeout.
     * @return array Status codes keyed by the index of the URL.
     */
    private static function multiCheckNetResource(
        $taskUrls,
        $concurrency = 5,
        $config = [
            'verify' => false,
            'timeout' => 3,
        ]
    ) {
        $client = new Client($config);
        $result = [];

        // Lazily builds one HEAD request per URL, keyed by the URL's index.
        $requests = function () use ($taskUrls, &$result) {
            foreach ($taskUrls as $index => $item) {
                try {
                    $request = new \GuzzleHttp\Psr7\Request('HEAD', $item);
                } catch (\InvalidArgumentException $exception) {
                    // A malformed URL must not abort the whole batch.
                    $result[$index] = 0;
                    continue;
                }

                yield $index => $request;
            }
        };

        $pool = new Pool($client, $requests(), [
            // How many requests are fetched concurrently.
            'concurrency' => $concurrency,
            // Delivered for each successful response.
            'fulfilled' => function (\GuzzleHttp\Psr7\Response $response, $index) use (&$result) {
                $result[$index] = $response->getStatusCode();
            },
            // Delivered for each failed request.
            'rejected' => function ($reason, $index) use (&$result) {
                $result[$index] = $reason instanceof \Throwable ? $reason->getCode() : 0;
            },
        ]);

        $pool->promise()->wait();

        // Responses arrive in completion order; return them in URL order.
        ksort($result);

        return $result;
    }
}