作为程序员一定要保持良好的睡眠,才能更好地编程

php中symfony/dom-crawler使用解释说明

发布时间:2020-05-05

话不多说,因为近期需要做一个项目采集的工作,因此通过composer了解到

symfony/dom-crawler

 symfony/css-selector


这两个配套使用

抓取dom非常的不错,因此今天有幸来使用,并记录下来使用过程


好下面先看看如何使用吧



帮助文档:https://symfony.com/doc/current/components/dom_crawler.html


github:https://github.com/symfony/dom-crawler


我使用的是thinkphp5.0的版本 php5.4 那么我引入  dom-crawler 的时候需要注意版本,

下面看看composer中如何配置

第一步:

composer加载 dom-crawler 和 css-selector 两个资源库

"require": {
    
     
    "symfony/dom-crawler":"v2.8.33",
    "symfony/css-selector":"v2.8.33"
},
"symfony/dom-crawler":"v2.8.33",
"symfony/css-selector":"v2.8.33"

这两项是我引入的资源

composer update 通过这个命令把文件包下载到本地




 第二步:实例,并使用


// Demo script: basic symfony/dom-crawler usage against an inline HTML document.
// Load Composer's autoloader so the Symfony components are available.
require 'vendor/autoload.php';

$crawler = new \Symfony\Component\DomCrawler\Crawler();

// Sample document used by every example below.
$html = <<<EOT
   <!DOCTYPE html>
<html>
    <body>
        <p class="message"><b>Hello</b> World!</p>
        <p>Hello Crawler!</p>
        <p>last p!</p>
        <b>bold</b>
        <div>div box</div>
        <div id='test'>
            <span>span标签</span>
        </div>
        <div id='second' class="news">
            <span>second 下span标签</span>
        </div>
         <div id='article-1' class="news">
           article-1内容
        </div>
         <div id='article-2'>
           article-2内容
        </div>
         <div id='article-3' class="news">
           article-3内容
        </div>
         <div id='article-4'>
           article-4内容
        </div>
    </body>
</html>
EOT;

$crawler->addHtmlContent($html, "utf-8");

// Iterate the top-level DOM nodes.
/*foreach ($crawler as $domElement) {
    var_dump($domElement->nodeName);
}*/

// XPath filtering on the <p> elements under <body>.
/*$crawler = $crawler->filterXPath('descendant-or-self::body/p');
P($crawler);*/

// Tag name of the first <div> matched under <body>.
//$tag = $crawler->filterXPath('//body/div')->nodeName();
//P($tag);

// Text of the first <p> element.
/*$firstP=$crawler->filter('body > p ')->eq(0)->text();
P($firstP);*/

// Same element, but including its inner HTML markup.
/*$firstP_html=$crawler->filter('body > p ')->eq(0)->html();
P($firstP_html);*/

// Text of the first <p>, using first() instead of eq(0).
/*$firstP=$crawler->filter('body > p')->first()->text();
P($firstP);*/

// Text of the second <p> — eq() is zero-based, like an array index.
/*$secondP=$crawler->filter('body > p ')->eq(1)->text();
P($secondP);*/

// Text of the last <p>.
/*$lastP=$crawler->filter('body > p')->last()->text();
P($lastP);*/

// Parent lookups can be chained — node name two levels above the <b> tag.
/*$parents=$crawler->filter('b')->parents()->parents()->nodeName();
P($parents);*/

// Value of the <b> tag nested inside a <p>.
/*$message = $crawler->filterXPath('//body/p/b')->text();
P($message);*/

// <span> under a <div>; with several matches, text() returns the first one.
/*$message = $crawler->filterXPath('//div/span')->text();
P($message);

$message = $crawler->filterXPath('//div/span')->eq(1)->text();
P($message);*/

// Child <span> of the element with id "test".
// Use '#' / '.' selectors and '>' for child elements — '/' is XPath syntax
// and raises an error inside filter().
/*$test_span_val=$crawler->filter('#test>span')->text();
P($test_span_val);*/

/*$_span_val=$crawler->filter('span')->eq(1)->text();
P($_span_val);*/

// id attribute of the last <div>.
/*$_span_val=$crawler->filter('div')->last()->attr("id");
P($_span_val);*/

// Collect the text of every <div> whose id contains "article-".
// FIX: the closure parameter is now fully qualified. This snippet declares no
// "use Symfony\Component\DomCrawler\Crawler;", so a bare "Crawler" type hint
// would resolve to a nonexistent global class and throw a TypeError when the
// closure is invoked.
$valIds = $crawler->filterXPath('//div[contains(@id,"article-")]')->each(function (\Symfony\Component\DomCrawler\Crawler $node, $i) {
    return $node->text();
});
// NOTE(review): P() is a project debug helper (ThinkPHP style), not defined here.
P($valIds);




dom-crawler 支持两种写法,一种是xpath 写法filterXPath()   /   一种是css写法 filter()


这两种写法可以同时使用,也可以单独用。


本人喜欢使用第二种写法  css  方式 filter()


现在我把项目中使用到的一些案例,列举出来,方便日后查看:

// Parse the fetched page and build one array entry per <li> inside #oldlist.
$crawler = new Crawler();
$crawler->addHtmlContent($content);

// FIX: regex metacharacters are now escaped. The original '#(/index.php?...)#'
// made the trailing 'p' optional ('p?') and let '.' match any character, and
// '#cid-(\d+)-id-(\d+).html#' matched any character before "html".
$patternUrl = '#(/index\.php\?[^+\']+)#si';
$patternId = '#cid-(\d+)-id-(\d+)\.html#';

$liList = $crawler->filter('#oldlist li')->each(function (Crawler $node, $i) use ($patternUrl, $patternId) {

    // Visible title of the entry.
    $title = $node->filter('.moretitle')->text();
    $url = '';
    $articleId = 0;
    $catId = 0;

    // The target URL is embedded in the first <a>'s onclick attribute.
    if (preg_match_all($patternUrl, $node->filter('a')->attr('onclick'), $result)) {
        $url = isset($result[1][0]) ? $result[1][0] : '';
        // Pull category id and article id out of a "cid-N-id-M.html" URL.
        if ($url && preg_match_all($patternId, $url, $idResult)) {
            $catId = $idResult[1][0];
            $articleId = $idResult[2][0];
        }
    }

    return [
        'title' => $title,
        'url' => $url,
        'thumb' => '',
        'catId' => $catId,
        'articleId' => $articleId
    ];

});

// Standalone selector fragments from the same project. $node here refers to
// the current node inside an each() closure; $crawler is a full-page crawler.
$title = $node->filter('.tjtw-rtitle51')->text();

// FIX: the original line read "$src => ..." — '=>' is array/closure syntax and
// is a parse error in statement position; assignment uses '='.
$src = $node->filter('.tjtw-l img')->attr('data-src');

// Whole-document lookups by id / tag name.
$date = $crawler->filter('#post-date')->text();

$title = $crawler->filter('title')->text();

$pageContent = $crawler->filter('#page-content')->html();


看到上边的写法以后,是不是会有一种似曾相识的感觉?  像css写样式一样。




// Quick reference: common traversal methods on a Crawler selection.

// Narrow the crawler to the <p> children of <body>.
$crawler = $crawler->filter('body > p');

// Children matching an additional selector.
$crawler->filter('body')->children('p.lorem');

// Children / parents of the current selection.
$crawler->filter('body')->children();
$crawler->filter('body > p')->parents();

// All following / preceding siblings.
$crawler->filter('body > p')->nextAll();
$crawler->filter('body > p')->previousAll();

// All siblings, both directions.
$crawler->filter('body > p')->siblings();

// First / last node of the selection.
$crawler->filter('body > p')->first();
$crawler->filter('body > p')->last();

// first, select the link by id, class or content...
$linkCrawler = $crawler->filter('#sign-up');
$linkCrawler = $crawler->filter('.user-profile');
$linkCrawler = $crawler->selectLink('Log in');

// ...then, get the Link object:
$link = $linkCrawler->link();

// or do all this at once:
$link = $crawler->filter('#sign-up')->link();
$link = $crawler->filter('.user-profile')->link();
$link = $crawler->selectLink('Log in')->link();




use Symfony\Component\DomCrawler\Crawler;
// ...

// reduce() keeps only the nodes for which the closure returns true —
// here, every even-indexed <p> under <body>.
$crawler = $crawler
->filter('body > p')
->reduce(function (Crawler $node, $i) {
	// filters every other node
	return ($i % 2) == 0;
});


use Symfony\Component\DomCrawler\Crawler;

// Sample document for the evaluate() examples below.
$html = '<html>
<body>
    <span id="article-100" class="article">Article 1</span>
    <span id="article-101" class="article">Article 2</span>
    <span id="article-102" class="article">Article 3</span>
</body>
</html>';

$crawler = new Crawler();
$crawler->addHtmlContent($html);

// evaluate() runs an XPath expression against every node in the selection
// and returns one result per node.
$crawler->filterXPath('//span[contains(@id, "article-")]')->evaluate('substring-after(@id, "-")');
/* Result:
[
    0 => '100',
    1 => '101',
    2 => '102',
];
*/

// Evaluated on the document root, only the first match is returned.
$crawler->evaluate('substring-after(//span[contains(@id, "article-")]/@id, "-")');
/* Result:
[
    0 => '100',
]
*/

// XPath functions such as count() can be evaluated per node as well.
$crawler->filterXPath('//span[@class="article"]')->evaluate('count(@id)');
/* Result:
[
    0 => 1.0,
    1 => 1.0,
    2 => 1.0,
]
*/










以下是通过网络找到一些案例,支持XPath 的写法

PHP中的XPath支持

在使用xpath的时候,首先安装一个浏览器插件吧,会帮助我们快速定位标签


xpath.png

名字叫XPath Helper



XPath表达式可以查找HTML节点或元素,是一种路径表达语言。
那么需要先学习下XPath的基础,花个1-2小时入门,XPath就是页面数据提取能力的最佳内功之一,这个时间值得花。
既然用XPath提取页面数据是通行的方式,那么PHP中支持XPath的扩展包是什么呢?
为了帮大家节约时间,Symfony DomCrawler 就是PHP中最佳XPath包之一,直接用他吧,Symfony出品质量可是有目共睹,PHP热门框架laravel都用Symfony的包。
Symfony DomCrawler官方文档介绍的实例有限,建议有需要的情况下把代码读下,更能熟知新用法。

<?php

require __DIR__ . '/vendor/autoload.php';
 
use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;
 

/**
 * Crawl a Douban movie page and extract structured data via XPath.
 *
 * Downloads the page with Guzzle, then uses Symfony DomCrawler to pull out
 * the title, poster, director(s), screenwriter, main cast, release dates,
 * synopsis, and per-actor details.
 *
 * @return array structured movie data (possibly partial if extraction fails)
 */
function Spider()
{
    // Page to crawl.
    $url = 'https://movie.douban.com/subject/25812712/?from=showing';

    // Download the page. The spider-style User-Agent mimics Baidu's crawler.
    $client   = new Client([
        'timeout' => 10,
        'headers' => ['User-Agent' => 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)',
        ],
    ]);
    $response = $client->request('GET', $url)->getBody()->getContents();

    // Extract data from the page with XPath.
    $data    = []; // structured result accumulates here
    $crawler = new Crawler();
    $crawler->addHtmlContent($response);

    try {
        // Movie title.
        // Elements carrying an id make for the simplest XPath expressions.
        $data['name'] = $crawler->filterXPath('//*[@id="content"]/h1/span[1]')->text();
        // Poster image URL.
        $data['cover'] = $crawler->filterXPath('//*[@id="mainpic"]/a/img/@src')->text();
        // Director(s).
        $data['director'] = $crawler->filterXPath('//*[@id="info"]/span[1]/span[2]')->text();
        // Split multiple directors into an array.
        $data['director'] = explode('/', $data['director']);
        // Trim surrounding whitespace from each name.
        $data['director'] = array_map('trim', $data['director']);

        // Screenwriter.
        // BUG FIX: the original stored this under $data['cover'], silently
        // overwriting the poster URL extracted above.
        $data['screenwriter'] = $crawler->filterXPath('//*[@id="info"]/span[2]/span[2]/a')->text();
        // Main cast.
        $data['mactor'] = $crawler->filterXPath('//*[@id="info"]/span[contains(@class,"actor")]/span[contains(@class,"attrs")]')->text();
        // Split multiple actors into an array.
        $data['mactor'] = explode('/', $data['mactor']);
        // Trim surrounding whitespace from each name.
        $data['mactor'] = array_map('trim', $data['mactor']);

        // Release dates: take the whole #info text...
        $data['rdate'] = $crawler->filterXPath('//*[@id="info"]')->text();
        // ...then pull out dates with a regex,
        // e.g. 2017-07-07(中国大陆) / 2017-06-14(安锡动画电影节) / 2017-06-30(美国)
        preg_match_all("/(\d{4})-(\d{2})-(\d{2})\(.*?\)/", $data['rdate'], $rdate);
        $data['rdate'] = $rdate[0];

        // Synopsis — demonstrates selecting by class instead of id.
        $data['introduction'] = trim($crawler->filterXPath('//div[contains(@class,"indent")]/span')->text());

        // Actors: this XPath yields multiple nodes, so iterate with each().
        // each() takes a closure; outer variables are captured by reference
        // via "use (&$data)" so the closure can append to the result.
        $crawler->filterXPath('//ul[contains(@class,"celebrities-list from-subject")]/li')->each(function (Crawler $node, $i) use (&$data) {
            $actor['name']   = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"name")]/a')->text(); // name
            $actor['role']   = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"role")]')->text(); // role
            $actor['avatar'] = $node->filterXPath('//a/div[contains(@class,"avatar")]/@style')->text(); // avatar style attr
            // The style attribute looks like
            // "background-image: url(https://img3.doubanio.com/img/celebrity/medium/5253.jpg)"
            // — extract the image URL with a regex.
            preg_match_all("/((https|http|ftp|rtsp|mms)?:\/\/)[^\s]+\.(jpg|jpeg|gif|png)/", $actor['avatar'], $avatar);
            $actor['avatar'] = $avatar[0][0];
            //print_r($actor);
            $data['actor'][] = $actor;
        });

    } catch (\Exception $e) {
        // Best-effort extraction: any failure leaves $data partially filled.
        // NOTE(review): consider at least logging $e->getMessage() here.
    }

    return $data;

}




恩,好现在再来看看这个xpath是如何使用的:





通过表格获取里面的信息,整理成数组 案例

  $crawler = new Crawler();

        $content = file_get_contents(__DIR__ . DIRECTORY_SEPARATOR . 'aa.html');
        $crawler->addHtmlContent($content, "utf-8");

        $crawler->filter('h2')->each(function (Crawler $node, $i) use ($crawler) {

            //获取此元素后面所有元素    p标签  第一个元素
            $path = $node->nextAll()->filter('p')->eq(1)->text();
            $tableComment = $node->text();
            
            $pathData = explode("/", trim(trim(str_replace('Path:', '', $path)), '/'));
            $tableName = str_replace('-', '_', $pathData[1]);

            $fileName = ucfirst(\Illuminate\Support\Str::camel($tableName));

            $filePath = base_path('resources' . DIRECTORY_SEPARATOR . 'model_schemas');

            dump($tableName, $tableComment);
            $jsonData = [
                [
                    "name" => 'id',
                    "dbType" => 'integer,true',
                    "htmlType" => 'text',
                    "validations" => null,
                    "searchable" => true,
                    "fillable" => true,
                    "primary" => false,
                    "inForm" => true,
                    "inIndex" => true,
                    "inView" => true
                ],
                [
                    "name" => 'global_id',
                    "dbType" => 'bigInteger',
                    "htmlType" => 'text',
                    "validations" => null,
                    "searchable" => true,
                    "fillable" => true,
                    "primary" => false,
                    "inForm" => true,
                    "inIndex" => true,
                    "inView" => true
                ]
            ];

            $sql = [
                "drop table if exists {$tableName};",
                "create table {$tableName}(",
                "`id` int unsigned NOT NULL primary key auto_increment COMMENT '自增id',",
                "`global_id` bigint(20) unsigned NOT NULL DEFAULT '0' COMMENT '雪花id global_id',",
            ];

            //获取当前元素后面  所有table中 class  是table-field 的table
            $tableContent = $node->nextAll()->filter('table.talbe-field')->eq(0)
                ->filter('tbody tr')->each(function (Crawler $trNode) use (&$jsonData, &$sql) {

                    if ($trNode->filter('tr td')->count() > 0) {
                        //获取tr中td的内容
                        $fieldName = $trNode->filter('tr td')->eq(0)->text();
                        $fieldName = \Illuminate\Support\Str::snake($fieldName, '_');

                        if (!stristr($fieldName, '├─')) {
                            $fieldType = $trNode->filter('tr td')->eq(1)->text();
                            $validatorTxt = $trNode->filter('tr td')->eq(2)->text();
                            $validator = $validatorTxt == '非必须' ? null : 'required';

                            //处理sql
                            $sqlType = $this->getFieldSqlType($trNode->filter('tr td')->eq(5)->text());
                            $sqlNull = '';
                            if ($validatorTxt == '必须') {
                                $sqlNull = 'NOT NULL';
                            } else {
                                if (stristr($sqlType, 'char')) {
                                    $sqlNull = "default ''";
                                } elseif (stristr($sqlType, 'date') || $sqlType == 'text') {
                                    $sqlNull = "default null";
                                } else {
                                    $sqlNull = "default 0";
                                }
                            }
                            $commentStr = $trNode->filter('tr td')->eq(4)->text() . ' ' . $trNode->filter('tr td')->eq(5)->text();
                            if (trim($fieldType) == 'string') {
                                $dbFieldType = $this->getFieldType($trNode->filter('tr td')->eq(5)->text());
                                $jsonData[] = $this->getField($fieldName, $dbFieldType, 'text', $validator);
                                $sql[] = "`{$fieldName}` " . $sqlType . " {$sqlNull} COMMENT '{$commentStr}',";
                            } else {
                                $jsonData[] = $this->getField($fieldName, 'longText', 'text', $validator);
                                $sql[] = "`{$fieldName}` longText COMMENT '{$commentStr}',";
                            }
                        }
                    }
                });

            $jsonData[] = [
                "name" => 'gmt_created',
                "dbType" => 'dateTime',
                "htmlType" => 'date',
                "validations" => 'required',
                "searchable" => true,
                "fillable" => true,
                "primary" => false,
                "inForm" => true,
                "inIndex" => true,
                "inView" => true
            ];
            $jsonData[] = [
                "name" => 'gmt_modified',
                "dbType" => 'dateTime',
                "htmlType" => 'date',
                "validations" => 'required',
                "searchable" => true,
                "fillable" => true,
                "primary" => false,
                "inForm" => true,
                "inIndex" => true,
                "inView" => true
            ];
            $jsonData[] = [
                "name" => 'is_deleted',
                "dbType" => 'integer',
                "htmlType" => 'text',
                "validations" => '',
                "searchable" => true,
                "fillable" => true,
                "primary" => false,
                "inForm" => true,
                "inIndex" => true,
                "inView" => true
            ];

            $sql[] = "`gmt_created` datetime NOT NULL COMMENT '创建时间',";
            $sql[] = "`gmt_modified` datetime NOT NULL COMMENT '修改时间',";
            $sql[] = "`is_deleted` tinyint(3) unsigned NOT NULL DEFAULT '0' COMMENT '逻辑删除状态位1:已删除;0未删除',";
            $sql[] = "UNIQUE KEY `uniq_id` (`global_id`) USING BTREE";
            $sql[] = ")ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='{$tableComment}';";

            file_put_contents($filePath . DIRECTORY_SEPARATOR . $fileName . '.sql', implode("\n", $sql));
            file_put_contents($filePath . DIRECTORY_SEPARATOR . $fileName . '.json', jsonencode($jsonData));
//            exit;
        });