《PHP应用:PHP制作百度词典查词采集器》要点:
本文介绍了PHP应用:PHP制作百度词典查词采集器,希望对您有用。如果有疑问,可以联系我们。
PHP学习
PHP学习百度dict 采集样本
PHP学习写的采集百度dict词典翻译后的所有结果数据,当然附带了13.5w单词库和采集简单的案例,这里我把写出的主要类dict.class.php放出来,项目地址http://github.com/widuu/baidu_dict,有需要的直接fork就可以了~么么哒,这东西用的人很少,所以有用的兄弟拿走了哈~
- PHP学习
- <?php
- /**
- * dict.class.php 采集百度词典翻译内容
- *
- * @copyright (C) 2014 widuu
- * @license http://www.widuu.com
- * @lastmodify 2014-2-15
- */
- header("content-type:text/html;charset=utf8");
- class Dict{
- private $word;
- //显示的条数
- private static $num = 10;
- public function __construct(){}
- /**
- * 公用返回百度采集数据的办法
- * @param string 英文单词
- * retun array(
- * symbol" => 音标
- * "pro" => 发音
- * "example"=> 例句
- * "explain"=> 简明释义
- * "synonym"=> 同反义词
- * "phrase" => 短语数组
- * )
- *
- */
- public function content($word){
- $this -> word = $word;
- $symbol = $this -> Pronounced();
- $pro = $this->getSay();
- $example = $this -> getExample();
- $explain = $this -> getExplain();
- $synonym = $this -> getSynonym();
- $phrase = $this -> getPhrase();
- $result = array(
- "symbol" => $symbol, //音标
- "pro" => $pro, //发音
- "example"=> $example, //例句
- "explain"=> $explain, //简明释义
- "synonym"=> $synonym, //同反义词
- "phrase" => $phrase //短语数组
- );
- return $result;
- }
- /**
- * 远程获取百度翻译内容
- * get function curl
- * retun string
- *
- */
- private function getContent(){
- $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
- $ch = curl_init();
- $url = "http://dict.baidu.com/s?wd=".$this->word;
- curl_setopt($ch, CURLOPT_URL, $url);
- curl_setopt($ch, CURLOPT_USERAGENT,$useragent);
- curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
- curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
- curl_setopt($ch, CURLOPT_HTTPGET, 1);
- curl_setopt($ch, CURLOPT_AUTOREFERER,1);
- curl_setopt($ch, CURLOPT_HEADER, 0);
- curl_setopt($ch, CURLOPT_TIMEOUT, 30);
- $result = curl_exec($ch);
- if (curl_errno($curl)) {
- echo 'Errno'.curl_error($curl);
- }
- curl_close($ch);
- return $result;
- }
- /**
- * 获取百度翻译发音
- * retun array(英,美)
- *
- */
- private function Pronounced(){
- $data = $this -> getContent();
- preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui",$data,$pronounced);
- return array(
- 'en' => $pronounced[1][0],
- 'us' => $pronounced[1][1]
- );
- }
- /**
- * 获取百度翻译发音
- * return array(英,美)
- *
- */
- private function getSay(){
- $data = $this -> getContent();
- preg_match_all("/url=\"(.*)\"/Ui",$data,$pronounced);
- return array(
- 'en' => $pronounced[1][0],
- 'us' => $pronounced[1][1]
- );
- }
- /**
- * 获取百度翻译例句
- * return array() 多维数组 例句
- *
- */
- private function getExample(){
- $str = "";
- $data = $this -> getContent();
- preg_match_all("/var example_data = (.*)\]\;/Us",$data,$example);
- $data1 = "[[[".ltrim($example[1][0],"[");
- $data2 = explode("[[[",$data1);
- $num = count(array_filter($data2));
- foreach($data2 as $key => $value){
- $data3 = explode("[[","[[".$value);
- foreach ($data3 as $k => $v) {
- preg_match_all("/\[\"(.*)\",/Us","[".$v, $match);
- if(!empty($match[1])){
- $str .= implode($match[1]," ")."@";
- }
- }
- }
- $data4 = trim($str,"@");
- $data5 = explode("@", $data4);
- $result = array_chunk($data5, 2);
- return $result;
- }
- /**
- * 获取简明释义
- * return array (x => "词性",b => "附属")
- *
- **/
- private function getExplain(){
- $data = $this -> getContent();
- preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\<div(\s+)class\=\"source\"\>/Us",$data,$explain);
- $r_data = $explain[1][0];
- preg_match_all("/\<p\>\<strong\>(?P<adj>.*)\<\/strong\>\<span\>(?P<name>.*)\<\/span\>\<\/p\>/Us", $r_data, $a_data);
- preg_match_all("/\<span\>(?P<tag>[^\>]+)\:\<a(\s+)href\=\"(.*)\"\>(?P<word>.*)\<\/a\>\<\/span\>/Us", $r_data, $b_data);
- $result = array();
- foreach ($a_data["adj"] as $key => $value) {
- $result[$value] = $a_data["name"][$key];
- }
- $word_b = array();
- foreach ($b_data["tag"] as $key => $value) {
- $word_b[$value] = strip_tags($b_data["word"][$key]);
- }
- $result_data = array("x" => $result,"b" => $word_b);
- return $result_data;
- }
- /**
- * 获取同义词
- * return array(0 => "同义词", 1 => "反义词") 一般为多维数组
- *
- */
- private function getSynonym(){
- $data = $this -> getContent();
- preg_match_all("/id=\"en\-syn\-ant\"\>(.*)<div(\s+)class\=\"source\">/Us",$data,$synonym);
- $content = $synonym[1][0];
- $data1 = explode("</dl>", $content);
- $result = array();
- $data2 = array();
- foreach ($data1 as $key => $value) {
- preg_match_all("/\<strong\>(?P<adj>.*)\ \;\<\/strong\>\<\/div\>\<div(\s+)class\=\"syn\-ant\-list\"\>\<ul\>(?<content>.*)\<\/ul\>/Us", $value, $r_data);
- $data2[$key]["adj"] = $r_data["adj"];
- $data2[$key]["content"] = $r_data["content"];
- }
- foreach ($data2 as $key => $value) {
- foreach ($value["content"] as $k => $v) {
- if(!empty($v)){
- preg_match_all("/\<li\>\<p\>(?P<title>.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $v, $v_data);
- foreach ($v_data['title'] as $m => $d) {
- $data = strip_tags(preg_replace("<</a>>"," ", $v_data["value"][$m]));
- $result[$key][$value["adj"][$k]][$d] = $data;
- }
- }
- }
- }
- return $result;
- }
- /**
- * 获取短语词组
- * return array (key => value) 一维或者多维数组
- *
- */
- private function getPhrase(){
- $num = self::$num;
- $data = $this -> getContent();
- preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us",$data,$phrase);
- $data = explode("</dd>",$phrase[1][0]);
- $data1 = array_slice($data,0,$num);
- $result = array();
- foreach ($data1 as $key => $value) {
- $data2 = explode("</p>", $value);
- $n = count($data2);
- if($n<=3){
- $result[str_replace(" ","",strip_tags($data2[0]))] = strip_tags($data2[1]);
- }else{
- $data3 = array_slice($data2,0,$n-1);
- $data4 = array_slice($data2,0,2);
- $res = array_diff($data3,$data4);
- $data5 = array_chunk($res,2);
- $key_value = trim(str_replace(" ","",strip_tags($data4[0])));
- $result[$key_value] = strip_tags($data4[1]);
- foreach ($data5 as $key => $value) {
- foreach ($value as $k => $v) {
- $value[$k] = strip_tags($v);
- }
- $array = array($result[$key_value],$value);
- if (array_key_exists($key_value, $result)){
- $result[$key_value] = $array;
- }
- }
- }
- }
- return $result;
- }
- /**
- * 将数组转换为字符串
- *
- * @param array $data 数组
- * @param bool $isformdata 如果为0,则不使用new_stripslashes处理,可选参数,默认为1
- * @return string 返回字符串,如果,data为空,则返回空
- */
- private function array2string($data, $isformdata = 1) {
- if($data == '') return '';
- if($isformdata) $data = $this->new_stripslashes($data);
- return addslashes(var_export($data, TRUE));
- }
- /**
- * 返回经stripslashes处理过的字符串或数组
- * @param $string 需要处理的字符串或数组
- * @return mixed
- */
- private function new_stripslashes($string) {
- if(!is_array($string)) return stripslashes($string);
- foreach($string as $key => $val) $string[$key] = $this->new_stripslashes($val);
- return $string;
- }
- }
- // $word = new dict("express");
- // $word ->content();
PHP学习以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢.
欢迎参与《PHP应用:PHP制作百度词典查词采集器》讨论,分享您的想法,维易PHP学院为您提供专业教程。