CURL函数加正则抓取百度百科信息

2018-08-17T10:52:00

整体思路:先用 cURL 函数抓取整个页面的 HTML,再用正则表达式对数据进行清洗,提取出需要的信息。

<?php
header("Content-Type: text/html;charset=utf-8");
/**
 * Fetch a URL with cURL and return the response body.
 *
 * The request has an overall timeout of 120 seconds. For https URLs the
 * peer and host certificate checks are switched off (see the note in the
 * body). When $proxy_flag is true the request is routed through the proxy
 * described by $proxy.
 *
 * @author chunlai
 * @date   2018-08-16
 * @Email  kk@yaochunlai.xin
 *
 * @param  string  $url        URL to fetch.
 * @param  boolean $proxy_flag Whether to send the request through a proxy.
 * @param  array   $proxy      Proxy settings: 'host' (required when
 *                             $proxy_flag is true) and optional 'port',
 *                             'name' (user name) and 'pass' (password).
 * @return string|false        Response body, or false if the handle could
 *                             not be created, the proxy settings are
 *                             unusable, or the transfer failed.
 */
 function setCurl($url, $proxy_flag=false, $proxy=array())
 {
     $ch = curl_init($url);
     if ($ch === false) {
         return false;
     }

     curl_setopt_array($ch, array(
         CURLOPT_RETURNTRANSFER => true,   // return the body instead of printing it
         CURLOPT_HEADER         => false,  // body only, no response headers
         CURLOPT_CONNECTTIMEOUT => 0,      // no connect limit of our own ...
         CURLOPT_TIMEOUT        => 120,    // ... the whole transfer is capped here
     ));

     // Scheme comparison is case-insensitive so "HTTPS://..." is handled too.
     if (strncasecmp($url, 'https', 5) === 0) {
         // NOTE(review): certificate verification is disabled, exactly as in
         // the original code. This permits man-in-the-middle attacks; enable
         // verification once a CA bundle is available on the host.
         curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
         curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
     }

     if ($proxy_flag) {
         if (!is_array($proxy) || empty($proxy['host'])) {
             // A proxy was requested but cannot be configured: fail instead
             // of silently sending the request over a direct connection.
             curl_close($ch);
             return false;
         }
         curl_setopt($ch, CURLOPT_PROXY, $proxy['host']);
         if (!empty($proxy['port'])) {
             curl_setopt($ch, CURLOPT_PROXYPORT, (int) $proxy['port']);
         }
         // Credentials go through CURLOPT_PROXYUSERPWD rather than being
         // glued into the proxy string, so a password containing '@' or a
         // host that already carries a scheme no longer corrupts the value.
         if (isset($proxy['name']) && $proxy['name'] !== '') {
             $proxy_pass = isset($proxy['pass']) ? $proxy['pass'] : '';
             curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy['name'].':'.$proxy_pass);
         }
     }

     $result = curl_exec($ch);
     curl_close($ch);
     return $result;
 }
 /**
  * Extract basic film facts from a Baidu Baike page and dump them.
  *
  * Looks for the production company ('chupin'), director ('director') and
  * leading actors ('actor') entries, strips the HTML tags from each captured
  * fragment and prints the resulting array with var_dump(). A field that is
  * not found in the page is reported as an empty string.
  *
  * The patterns do not let '.' match newlines, so the page is expected to
  * have its line breaks removed beforehand.
  *
  * @param  string $baike_info HTML of the Baike page.
  * @return void               The result is printed, not returned.
  */
 function handleBaikeInfo($baike_info)
 {
     // Field name => pattern whose first capture group holds the raw value.
     $patterns = array(
         'chupin'   => '/出品公司(.*)\<\/dd\>/U',
         'director' => '/导演(.*)\<\/dd\>/U',
         'actor'    => '/主演\<\/dt\>(.*)\<\/dd\>/U',
     );

     $baike_info = (string) $baike_info;
     $basic_info = array();
     foreach ($patterns as $key => $pattern) {
         // Only read the capture group when the pattern really matched;
         // otherwise $match is empty and indexing it raises a warning.
         $found = preg_match($pattern, $baike_info, $match) === 1 && isset($match[1]);
         $basic_info[$key] = $found ? strip_tags($match[1]) : '';
     }
     var_dump($basic_info);
 }
 /**
  * Remove every whitespace-like sequence from a string.
  *
  * Strips spaces, tabs, line feeds, carriage returns, the "&nbsp;" entity
  * and the UTF-8 encoded no-break space (U+00A0).
  *
  * @param  string $str Input text.
  * @return string      The text with all of the above removed.
  */
 function trimAll($str)
 {
     // U+00A0 is removed by its UTF-8 byte sequence instead of a /u regex:
     // preg_replace() returns null for input that is not valid UTF-8, which
     // would throw away the entire page. For valid UTF-8 the result is the same.
     $remove = array(' ', "\t", "\n", "\r", '&nbsp;', "\xC2\xA0");
     return str_replace($remove, '', $str);
 }

  // Fetch the Baike entry, strip all whitespace from the HTML and print the
  // extracted fields.
  $url = 'https://baike.baidu.com/item/%E6%88%98%E7%8B%BC%E2%85%A1/20794668?fr=aladdin&fromid=17196087&fromtitle=%E6%88%98%E7%8B%BC2';
  $html = setCurl($url);
  if ($html === false) {
      // The download failed, so there is nothing to parse.
      echo 'curl执行出错';
  } else {
      // handleBaikeInfo() prints its result itself and returns nothing, so
      // its return value is neither stored nor echoed.
      handleBaikeInfo(trimAll($html));
  }

CURL访问接口API

什么是API?

类似于一个远程可访问的函数

有接收值(参数)、有返回值

应用

第三方API应用

跨语言通信

  /**
   * Call an HTTP API with cURL and return the raw response body.
   *
   * GET requests carry $params in the query string; POST requests send
   * $params as the request body. Any other $method leaves the request
   * without a URL, so the call fails and false is returned.
   *
   * If cURL reports an error, the message 'curl执行出错' is echoed and the
   * (false) result of curl_exec() is still returned.
   *
   * @param  string $url    Endpoint URL; may already contain a query string.
   * @param  array  $params Request parameters.
   * @param  string $method HTTP method, 'GET' or 'POST' (case-insensitive).
   * @param  array  $header Extra request header lines.
   * @return string|false   Response body, or false on failure.
   */
  function api($url,$params=array(),$method='GET',$header=array()){
    $opts = array(
      CURLOPT_TIMEOUT => 30,
      CURLOPT_RETURNTRANSFER => true,
      // NOTE(review): certificate verification is disabled, as in the
      // original code; this permits man-in-the-middle attacks.
      CURLOPT_SSL_VERIFYHOST => false,
      CURLOPT_SSL_VERIFYPEER => false,
      CURLOPT_HTTPHEADER => $header,
      CURLOPT_HEADER => false
    );
    switch(strtoupper($method)){
      case 'GET':
        $query = http_build_query($params);
        if ($query !== '') {
          // Pick the separator from what the URL already has, so an existing
          // query string is extended instead of getting a second '?'.
          if (strpos($url, '?') === false) {
            $separator = '?';
          } elseif (in_array(substr($url, -1), array('?', '&'), true)) {
            $separator = '';
          } else {
            $separator = '&';
          }
          $url .= $separator.$query;
        }
        // With no parameters the URL is used untouched (no dangling '?').
        $opts[CURLOPT_URL] = $url;
        break;
      case 'POST':
        $opts[CURLOPT_URL] = $url;
        $opts[CURLOPT_POST] = true;
        $opts[CURLOPT_POSTFIELDS] = $params;
        break;
    }
    $ch = curl_init();
    curl_setopt_array($ch,$opts);
    $result = curl_exec($ch);
    $error = curl_error($ch);
    curl_close($ch);
    if($error){
      echo 'curl执行出错';
    }
    return $result;
  }

  // Demo call: request member #2 from the local endpoint via api() (GET by
  // default) and print whatever comes back.
  $params = array(
    'id' => 2,
  );
  $url = 'http://localhost/get_member_name.php';
  $ret = api($url, $params);
  print $ret;
当前页面是本站的「Baidu MIP」版。发表评论请点击:完整版 »
因本文不是用Markdown格式的编辑器书写的,转换的页面可能不符合MIP标准。