话不多说,因为近期需要做一个项目采集的工作,因此通过composer了解到
symfony/dom-crawler
symfony/css-selector
这两个配套使用
抓取dom非常的不错,因此今天有幸来使用,并记录下来使用过程
好下面先看看如何使用吧
帮助文档:https://symfony.com/doc/current/components/dom_crawler.html
github:https://github.com/symfony/dom-crawler
我使用的是thinkphp5.0的版本 php5.4 那么我引入 dom-crawler 的时候需要注意版本,
下面看看composer中如何配置
第一步:
composer加载 dom-crawler 和 css-selector 两个资源库
"require": { "symfony/dom-crawler":"v2.8.33", "symfony/css-selector":"v2.8.33" },
"symfony/dom-crawler":"v2.8.33", "symfony/css-selector":"v2.8.33" 这两项是我引入的资源 composer update 通过这个命令把文件包下载到本地
第二步:实例,并使用
require 'vendor/autoload.php';

// FIX: the each() callback at the bottom type-hints `Crawler`. Without this
// import the hint resolves against the current namespace and the callback
// invocation fails with a TypeError, so the import is required.
use Symfony\Component\DomCrawler\Crawler;

$crawler = new Crawler();

// Sample document used by all the examples below.
$html = <<<EOT
<!DOCTYPE html>
<html>
<body>
<p class="message"><b>Hello</b> World!</p>
<p>Hello Crawler!</p>
<p>last p!</p>
<b>bold</b>
<div>div box</div>
<div id='test'> <span>span标签</span> </div>
<div id='second' class="news"> <span>second 下span标签</span> </div>
<div id='article-1' class="news"> article-1内容 </div>
<div id='article-2'> article-2内容 </div>
<div id='article-3' class="news"> article-3内容 </div>
<div id='article-4'> article-4内容 </div>
</body>
</html>
EOT;

$crawler->addHtmlContent($html, "utf-8");

// NOTE(review): P() appears to be a project-level debug/dump helper
// (ThinkPHP style) — confirm it is defined before running these samples.

// Iterate the root nodes of the document:
/*foreach ($crawler as $domElement) { var_dump($domElement->nodeName); }*/

// XPath filtering of <p> elements under <body>:
/*$crawler = $crawler->filterXPath('descendant-or-self::body/p'); P($crawler);*/

//$tag = $crawler->filterXPath('//body/div')->nodeName();
//P($tag);

// Text of the first <p>:
/* $firstP=$crawler->filter('body > p ')->eq(0)->text(); P($firstP);*/

// Inner HTML (tags included) of the first <p>:
/* $firstP_html=$crawler->filter('body > p ')->eq(0)->html(); P($firstP_html);*/

// first() is equivalent to eq(0):
/*$firstP=$crawler->filter('body > p')->first()->text(); P($firstP);*/

// eq() is zero-based, like array indexes — this is the second <p>:
/*$secondP=$crawler->filter('body > p ')->eq(1)->text(); P($secondP);*/

// Last <p> element:
/*$lastP=$crawler->filter('body > p')->last()->text(); P($lastP);*/

// parents() can be chained to walk further up the DOM tree:
/*$parents=$crawler->filter('b')->parents()->parents()->nodeName(); P($parents);*/

// Value of the <b> nested inside a <p>:
/*$message = $crawler->filterXPath('//body/p/b')->text(); P($message);*/

// When several nodes match, text() returns the first match by default:
/*$message = $crawler->filterXPath('//div/span')->text();
P($message);
$message = $crawler->filterXPath('//div/span')->eq(1)->text();
P($message);*/

// CSS selectors: use # / . plus the tag name; descend with ">" —
// do NOT use "/" in filter(), that is XPath syntax and will error out.
/*$test_span_val=$crawler->filter('#test>span')->text(); P($test_span_val);*/

/*$_span_val=$crawler->filter('span')->eq(1)->text(); P($_span_val);*/

// Read an attribute — id of the last <div>:
/*$_span_val=$crawler->filter('div')->last()->attr("id"); P($_span_val);*/

// Collect the text of every <div> whose id contains "article-"
// (XPath contains()), one array entry per node via each():
$valIds = $crawler->filterXPath('//div[contains(@id,"article-")]')->each(function (Crawler $node, $i) {
    return $node->text();
});
P($valIds);
dom-crawler 支持两种写法,一种是xpath 写法filterXPath() / 一种是css写法 filter()
这两种写法可以同时使用,也可以单独用。
本人喜欢使用第二种写法 css 方式 filter()
现在我把项目中使用到的一些案例,列举出来,方便日后查看:
// Extract title / url / category id / article id from every <li> in #oldlist.
// NOTE(review): assumes `Crawler` has been imported with a `use` statement
// and that $content holds the fetched HTML — confirm in the enclosing file.
$crawler = new Crawler();
$crawler->addHtmlContent($content);

// NOTE(review): "?" and "." are unescaped metacharacters in these patterns
// ("php?" makes the "p" optional, "." matches any char) — they likely still
// match the intended onclick strings, but verify against real page markup.
$patternUrl = '#(/index.php?[^+\']+)#si';
$patternId = '#cid-(\d+)-id-(\d+).html#';

$liList = $crawler->filter('#oldlist li')->each(function (Crawler $node, $i) use ($patternUrl, $patternId) {
    $title = $node->filter('.moretitle')->text();
    $url = '';
    $articleId = 0;
    $catId = 0;
    // The link target is embedded in the <a> element's onclick attribute.
    if (preg_match_all($patternUrl, $node->filter('a')->attr('onclick'), $result)) {
        $url = isset($result[1][0]) ? $result[1][0] : '';
        // Pull category id and article id out of the URL path.
        if ($url && preg_match_all($patternId, $url, $idResult)) {
            $catId = $idResult[1][0];
            $articleId = $idResult[2][0];
        }
    }
    return [
        'title' => $title,
        'url' => $url,
        'thumb' => '',
        'catId' => $catId,
        'articleId' => $articleId
    ];
});

// --- Unrelated one-line examples from the same project. These reference a
// $node variable, so in real code they belong INSIDE an each() closure. ---
$title = $node->filter('.tjtw-rtitle51')->text();
// FIX: original read `$src => $node->...` — "=>" is array syntax and is a
// parse error in an assignment; it must be "=".
$src = $node->filter('.tjtw-l img')->attr('data-src');
$date = $crawler->filter('#post-date')->text();
$title = $crawler->filter('title')->text();
$pageContent = $crawler->filter('#page-content')->html();
看到上边的写法以后,是不是会有一种似曾相识的感觉? 像css写样式一样。
// --- Node traversal with CSS-style filter() ---
$crawler = $crawler->filter('body > p');
$crawler->filter('body')->children('p.lorem');
$crawler->filter('body')->children();
$crawler->filter('body > p')->parents();
$crawler->filter('body > p')->nextAll();
$crawler->filter('body > p')->previousAll();
$crawler->filter('body > p')->siblings();
$crawler->filter('body > p')->first();
$crawler->filter('body > p')->last();

// --- Working with links ---
// first, select the link by id, class or content...
$linkCrawler = $crawler->filter('#sign-up');
$linkCrawler = $crawler->filter('.user-profile');
$linkCrawler = $crawler->selectLink('Log in');
// ...then, get the Link object:
$link = $linkCrawler->link();
// or do all this at once:
$link = $crawler->filter('#sign-up')->link();
$link = $crawler->filter('.user-profile')->link();
$link = $crawler->selectLink('Log in')->link();

// --- reduce(): keep only the nodes for which the callback returns true ---
use Symfony\Component\DomCrawler\Crawler;
// ...
$crawler = $crawler
    ->filter('body > p')
    ->reduce(function (Crawler $node, $i) {
        // filters every other node
        return ($i % 2) == 0;
    });

// --- evaluate(): run an XPath expression against the selection ---
use Symfony\Component\DomCrawler\Crawler;

$html = '<html>
<body>
<span id="article-100" class="article">Article 1</span>
<span id="article-101" class="article">Article 2</span>
<span id="article-102" class="article">Article 3</span>
</body>
</html>';

$crawler = new Crawler();
$crawler->addHtmlContent($html);

// Evaluated once per matched node — yields every numeric id suffix:
$crawler->filterXPath('//span[contains(@id, "article-")]')->evaluate('substring-after(@id, "-")');
/* Result: [ 0 => '100', 1 => '101', 2 => '102', ]; */

// Evaluated against the whole document — XPath string functions take
// only the FIRST node, so a single value comes back:
$crawler->evaluate('substring-after(//span[contains(@id, "article-")]/@id, "-")');
/* Result: [ 0 => '100', ] */

// count() runs per matched node as well:
$crawler->filterXPath('//span[@class="article"]')->evaluate('count(@id)');
/* Result: [ 0 => 1.0, 1 => 1.0, 2 => 1.0, ] */
以下是通过网络找到一些案例,支持XPath 的写法
PHP中的XPath支持
在使用xpath的时候,首先安装一个浏览器插件吧,会帮助我们快速定位标签
名字叫XPath Helper
XPath表达式可以查找HTML节点或元素,是一种路径表达语言。
那么需要先学习下XPath的基础,花个1-2小时入门,XPath就是页面数据提取能力的最佳内功之一,这个时间值得花。
既然用XPath提取页面数据是通行的方式,那么PHP中支持XPath的扩展包是什么呢?
为了帮大家节约时间,Symfony DomCrawler 就是PHP中最佳XPath包之一,直接用它吧,Symfony出品质量可是有目共睹,PHP热门框架laravel都用Symfony的包。
Symfony DomCrawler官方文档介绍的实例有限,建议有需要的情况下把代码读下,更能熟知新用法。
<?php

require __DIR__ . '/vendor/autoload.php';

use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Fetch a Douban movie page and extract structured movie data via XPath.
 *
 * Returns an associative array with name, cover, director, writer, main
 * actors, release dates, introduction and the full cast list. On any
 * extraction failure the partially-filled array is returned as-is.
 */
function Spider()
{
    // Page to crawl.
    $url = 'https://movie.douban.com/subject/25812712/?from=showing';

    // Download the page content. The Baiduspider UA sidesteps bot blocking.
    $client = new Client([
        'timeout' => 10,
        'headers' => ['User-Agent' => 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)',
        ],
    ]);
    $response = $client->request('GET', $url)->getBody()->getContents();

    // Structured result accumulates here.
    $data = [];
    $crawler = new Crawler();
    $crawler->addHtmlContent($response);
    try {
        // Movie title — elements with a css id make for easy XPath expressions.
        $data['name'] = $crawler->filterXPath('//*[@id="content"]/h1/span[1]')->text();
        // Poster image URL.
        $data['cover'] = $crawler->filterXPath('//*[@id="mainpic"]/a/img/@src')->text();
        // Director(s): split "A / B" into an array and trim whitespace.
        $data['director'] = $crawler->filterXPath('//*[@id="info"]/span[1]/span[2]')->text();
        $data['director'] = explode('/', $data['director']);
        $data['director'] = array_map('trim', $data['director']);
        // Screenwriter.
        // FIX: the original stored this into $data['cover'], silently
        // overwriting the poster URL extracted above — use a separate key.
        $data['writer'] = $crawler->filterXPath('//*[@id="info"]/span[2]/span[2]/a')->text();
        // Main actors, split into a trimmed array like the directors.
        $data['mactor'] = $crawler->filterXPath('//*[@id="info"]/span[contains(@class,"actor")]/span[contains(@class,"attrs")]')->text();
        $data['mactor'] = explode('/', $data['mactor']);
        $data['mactor'] = array_map('trim', $data['mactor']);
        // Release dates: grab the whole #info block, then regex out entries
        // like "2017-07-07(中国大陆) / 2017-06-14(安锡动画电影节) / 2017-06-30(美国)".
        $data['rdate'] = $crawler->filterXPath('//*[@id="info"]')->text();
        preg_match_all("/(\d{4})-(\d{2})-(\d{2})\(.*?\)/", $data['rdate'], $rdate);
        $data['rdate'] = $rdate[0];
        // Introduction — demonstrates selecting by class instead of id.
        $data['introduction'] = trim($crawler->filterXPath('//div[contains(@class,"indent")]/span')->text());
        // Cast list: the expression matches several <li> nodes, so iterate
        // with each(); the closure mutates $data via an explicit reference.
        $crawler->filterXPath('//ul[contains(@class,"celebrities-list from-subject")]/li')->each(function (Crawler $node, $i) use (&$data) {
            $actor['name'] = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"name")]/a')->text(); // name
            $actor['role'] = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"role")]')->text(); // role
            $actor['avatar'] = $node->filterXPath('//a/div[contains(@class,"avatar")]/@style')->text(); // avatar
            // The style attribute looks like
            // "background-image: url(https://img3.doubanio.com/img/celebrity/medium/5253.jpg)"
            // — regex out the bare image URL.
            preg_match_all("/((https|http|ftp|rtsp|mms)?:\/\/)[^\s]+\.(jpg|jpeg|gif|png)/", $actor['avatar'], $avatar);
            $actor['avatar'] = $avatar[0][0];
            $data['actor'][] = $actor;
        });
    } catch (\Exception $e) {
        // NOTE(review): extraction is deliberately best-effort — any XPath
        // miss aborts the remaining fields and returns what was collected.
        // Consider at least logging $e->getMessage() here.
    }
    return $data;
}
恩,好现在再来看看这个xpath是如何使用的:
通过表格获取里面的信息,整理成数组 案例
// Parse an API-doc HTML file (one <h2> + field <table> per endpoint) and
// generate, for each table: a CREATE TABLE .sql file and a model-schema
// .json file for the generator.
// NOTE(review): this snippet was extracted from a class method — it calls
// $this->getFieldSqlType() / getFieldType() / getField() and Laravel's
// base_path()/dump(); it will not run as a standalone script.
$crawler = new Crawler();
$content = file_get_contents(__DIR__ . DIRECTORY_SEPARATOR . 'aa.html');
$crawler->addHtmlContent($content, "utf-8");
$crawler->filter('h2')->each(function (Crawler $node, $i) use ($crawler) {
    // The second <p> after each heading holds "Path: /xxx/yyy-zzz/...".
    $path = $node->nextAll()->filter('p')->eq(1)->text();
    $tableComment = $node->text();
    // Derive the table name from the second path segment: "a-b" -> "a_b".
    $pathData = explode("/", trim(trim(str_replace('Path:', '', $path)), '/'));
    $tableName = str_replace('-', '_', $pathData[1]);
    $fileName = ucfirst(\Illuminate\Support\Str::camel($tableName));
    $filePath = base_path('resources' . DIRECTORY_SEPARATOR . 'model_schemas');
    dump($tableName, $tableComment);

    // Every table starts with the standard id / global_id columns.
    $jsonData = [
        [
            "name" => 'id', "dbType" => 'integer,true', "htmlType" => 'text',
            "validations" => null, "searchable" => true, "fillable" => true,
            "primary" => false, "inForm" => true, "inIndex" => true, "inView" => true
        ],
        [
            "name" => 'global_id', "dbType" => 'bigInteger', "htmlType" => 'text',
            "validations" => null, "searchable" => true, "fillable" => true,
            "primary" => false, "inForm" => true, "inIndex" => true, "inView" => true
        ]
    ];
    $sql = [
        "drop table if exists {$tableName};",
        "create table {$tableName}(",
        "`id` int unsigned NOT NULL primary key auto_increment COMMENT '自增id',",
        "`global_id` bigint(20) unsigned NOT NULL DEFAULT '0' COMMENT '雪花id global_id',",
    ];

    // First field table after the heading.
    // NOTE(review): the class "talbe-field" looks like a typo of
    // "table-field" — it may intentionally match misspelled markup in
    // aa.html; confirm against the actual source document.
    $tableContent = $node->nextAll()->filter('table.talbe-field')->eq(0)
        ->filter('tbody tr')->each(function (Crawler $trNode) use (&$jsonData, &$sql) {
            if ($trNode->filter('tr td')->count() > 0) {
                // Column cells: 0=name, 1=type, 2=required?, 4/5=comments.
                $fieldName = $trNode->filter('tr td')->eq(0)->text();
                $fieldName = \Illuminate\Support\Str::snake($fieldName, '_');
                // Skip nested/child rows (marked with a tree prefix).
                if (!stristr($fieldName, '├─')) {
                    $fieldType = $trNode->filter('tr td')->eq(1)->text();
                    $validatorTxt = $trNode->filter('tr td')->eq(2)->text();
                    $validator = $validatorTxt == '非必须' ? null : 'required';
                    // Map the documented type to a SQL column type + default.
                    $sqlType = $this->getFieldSqlType($trNode->filter('tr td')->eq(5)->text());
                    $sqlNull = '';
                    if ($validatorTxt == '必须') {
                        $sqlNull = 'NOT NULL';
                    } else {
                        if (stristr($sqlType, 'char')) {
                            $sqlNull = "default ''";
                        } elseif (stristr($sqlType, 'date') || $sqlType == 'text') {
                            $sqlNull = "default null";
                        } else {
                            $sqlNull = "default 0";
                        }
                    }
                    $commentStr = $trNode->filter('tr td')->eq(4)->text() . ' ' . $trNode->filter('tr td')->eq(5)->text();
                    if (trim($fieldType) == 'string') {
                        $dbFieldType = $this->getFieldType($trNode->filter('tr td')->eq(5)->text());
                        $jsonData[] = $this->getField($fieldName, $dbFieldType, 'text', $validator);
                        $sql[] = "`{$fieldName}` " . $sqlType . " {$sqlNull} COMMENT '{$commentStr}',";
                    } else {
                        // Non-string documented types fall back to longText.
                        $jsonData[] = $this->getField($fieldName, 'longText', 'text', $validator);
                        $sql[] = "`{$fieldName}` longText COMMENT '{$commentStr}',";
                    }
                }
            }
        });

    // Standard audit / soft-delete columns appended to every table.
    $jsonData[] = [
        "name" => 'gmt_created', "dbType" => 'dateTime', "htmlType" => 'date',
        "validations" => 'required', "searchable" => true, "fillable" => true,
        "primary" => false, "inForm" => true, "inIndex" => true, "inView" => true
    ];
    $jsonData[] = [
        "name" => 'gmt_modified', "dbType" => 'dateTime', "htmlType" => 'date',
        "validations" => 'required', "searchable" => true, "fillable" => true,
        "primary" => false, "inForm" => true, "inIndex" => true, "inView" => true
    ];
    $jsonData[] = [
        "name" => 'is_deleted', "dbType" => 'integer', "htmlType" => 'text',
        "validations" => '', "searchable" => true, "fillable" => true,
        "primary" => false, "inForm" => true, "inIndex" => true, "inView" => true
    ];
    $sql[] = "`gmt_created` datetime NOT NULL COMMENT '创建时间',";
    $sql[] = "`gmt_modified` datetime NOT NULL COMMENT '修改时间',";
    $sql[] = "`is_deleted` tinyint(3) unsigned NOT NULL DEFAULT '0' COMMENT '逻辑删除状态位1:已删除;0未删除',";
    $sql[] = "UNIQUE KEY `uniq_id` (`global_id`) USING BTREE";
    $sql[] = ")ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='{$tableComment}';";

    file_put_contents($filePath . DIRECTORY_SEPARATOR . $fileName . '.sql', implode("\n", $sql));
    // FIX: original called undefined jsonencode() — the function is json_encode().
    file_put_contents($filePath . DIRECTORY_SEPARATOR . $fileName . '.json', json_encode($jsonData));
    // exit;
});