最近在做ThinkPHP项目的时候,需要写一个自动审核数据的功能:用cURL抓取目标页面,与数据库中保存的标题进行对比,并且每隔60秒自动刷新页面继续审核!
代码如下:
首先创建:textpageAction.class.php
// NOTE(review): the methods below are the body of the action class stored in
// textpageAction.class.php; the class declaration itself is not shown in this listing.

    /**
     * Normalise a fetched page body to UTF-8.
     *
     * Valid UTF-8 (which includes plain ASCII) is returned untouched. Anything
     * else is strictly checked against GB2312/GBK and converted with iconv().
     *
     * @param mixed $str Raw bytes as returned by curl_exec().
     * @return string|false UTF-8 text, or false when the input is not a string,
     *                      matches none of the supported encodings (e.g. it is
     *                      still gzip-compressed), or cannot be converted.
     */
    public function safeEncoding($str)
    {
        if (!is_string($str)) {
            return false;
        }

        // Check UTF-8 first: the GB* encodings accept almost any byte pair, so
        // trying them first mis-detects UTF-8 pages.
        if ($str === '' || mb_check_encoding($str, 'UTF-8')) {
            return $str;
        }

        // Strict mode: only accept an encoding for which the whole string is valid.
        $code = mb_detect_encoding($str, array('GB2312', 'GBK'), true);
        if ($code === false) {
            return false;
        }

        return iconv($code, 'UTF-8', $str);
    }

    /**
     * Manual test helper: fetches one fixed page, dumps the decoded HTML and
     * reports whether each <h1> text occurs in a hard-coded title. Ends the request.
     */
    public function ceshi()
    {
        include_once('simple_html_dom.php');

        // Some sites block requests without a browser-like user agent.
        $raw = $this->fetchRemoteHtml(
            "http://bj.ohqly.com/1511/2014011548894568v0.html",
            'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)'
        );

        $html = $this->safeEncoding($raw);
        var_dump($html);

        $dom = str_get_html($html);
        // str_get_html() does not return an object for empty or unusable input.
        if (is_object($dom)) {
            foreach ($dom->find('h1') as $e) {
                echo $e->innertext . '<br>';
                $val = $e->innertext;
                dump(stripos("本田思域 2009款 1.8 VTI 自动豪华版", $val) !== false);
            }
            $dom->clear();
        }
        exit();
    }

    /**
     * Automatic review: takes up to 10 `textpage` rows with status 2 whose
     * `time` lies in the range given by the request parameter `where`
     * ("start,end"), fetches each row's URL and checks whether the page
     * contains the stored title (`textname`).
     *
     * Each processed row is saved with status 4 (passed) or status 3 (failed)
     * plus a reason. The template variable `jump` is '1' when rows were found
     * and '0' otherwise; the `tongguo` template is rendered at the end.
     */
    public function tongguo()
    {
        include_once('simple_html_dom.php'); // simple_html_dom must be available on the include path

        $rawRange = (isset($_REQUEST['where']) && is_string($_REQUEST['where'])) ? $_REQUEST['where'] : '';
        $range = array_pad(explode(',', $rawRange), 2, '');

        // Pass the bounds as values of a BETWEEN expression so the framework
        // quotes them, instead of splicing request input into raw SQL.
        $where = array(
            'time' => array('between', array($range[0], $range[1])),
            'status' => 2,
        );

        $model = M('textpage');
        $list = $model->where($where)->limit(10)->select();

        $this->assign('jump', $list ? '1' : '0');

        if ($list) {
            foreach ($list as $v) {
                $this->reviewRow($model, $v);
            }
        }

        $this->display('tongguo');
    }

    /**
     * Review a single row and persist the verdict.
     *
     * @param object $model The `textpage` model.
     * @param array  $row   Row with at least `id`, `url` and `textname`.
     */
    private function reviewRow($model, $row)
    {
        $raw = $this->fetchRemoteHtml($row['url']);
        $html = $this->decodePageBody($raw);

        if (!$html) {
            // Transfer failure versus a page that came back empty or undecodable.
            $reason = ($raw === false) ? "超时" : "文章不存在!";
            $this->saveAuditResult($model, $row, 3, $reason, "【文章不存在--未通过】");
            return;
        }

        $dom = str_get_html($html);
        if (!is_object($dom)) {
            $this->saveAuditResult($model, $row, 3, "链接不存在!", "【链接不存在--未通过】");
            return;
        }

        $matched = $this->pageTitleMatches($dom, $row['url'], $row['textname']);
        $dom->clear(); // release the parsed DOM before handling the next row

        if ($matched) {
            $this->saveAuditResult($model, $row, 4, "", "【通过】");
        } else {
            $this->saveAuditResult($model, $row, 3, "标题错误!", "【标题错误--未通过】");
        }
    }

    /**
     * Download a URL with cURL.
     *
     * @param string $url       Address to fetch.
     * @param string $userAgent Optional User-Agent header value.
     * @return string|false Response body, or false when the transfer failed.
     */
    private function fetchRemoteHtml($url, $userAgent = '')
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Advertise gzip/deflate and let cURL decompress the response.
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        // Bound the whole transfer too, so one stalled site cannot block the batch.
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        if ($userAgent !== '') {
            curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
        }

        $body = curl_exec($ch);
        curl_close($ch);

        return $body;
    }

    /**
     * Turn a raw response body into UTF-8 text, inflating it first when it is
     * still gzip-compressed.
     *
     * @param mixed $raw Result of fetchRemoteHtml().
     * @return string|false UTF-8 text, or false when nothing usable was received.
     */
    private function decodePageBody($raw)
    {
        if (!is_string($raw) || $raw === '') {
            return false;
        }

        $text = $this->safeEncoding($raw);
        if ($text === false || $text === '') {
            $inflated = $this->gzdecode($raw);
            $text = $inflated ? $this->safeEncoding($inflated) : false;
        }

        return $text;
    }

    /**
     * Check whether the expected title occurs in the page's title element(s).
     * The element inspected and the clean-up applied depend on the site.
     *
     * @param object $dom   Parsed document returned by str_get_html().
     * @param string $url   Page address, used to pick the site-specific rule.
     * @param string $title Expected title text.
     * @return bool
     */
    private function pageTitleMatches($dom, $url, $title)
    {
        $title = (string) $title;
        if ($title === '') {
            return false; // an empty needle must never count as a match
        }

        if (stripos($url, 'lieju.com') !== false) {
            $selector = 'span';
            $mode = 'raw';
        } elseif (stripos($url, 'baixing.com') !== false) {
            $selector = 'h2';
            $mode = 'nospace';
        } else {
            $selector = 'h1';
            $mode = 'trim';
        }

        foreach ($dom->find($selector) as $node) {
            $text = $node->innertext;
            if ($mode === 'nospace') {
                $text = str_replace(' ', '', strip_tags($text));
            } elseif ($mode === 'trim') {
                $text = trim(strip_tags($text));
            }
            if (stripos($text, $title) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Store the verdict for one row and print a progress line when the save
     * reports success.
     *
     * @param object $model  The `textpage` model.
     * @param array  $row    Row being reviewed.
     * @param int    $status New status value (3 = failed, 4 = passed).
     * @param string $reason Failure reason; empty when passed.
     * @param string $label  Text appended to the progress line.
     */
    private function saveAuditResult($model, $row, $status, $reason, $label)
    {
        $saved = $model
            ->where(array('id' => $row['id']))
            ->save(array('status' => $status, 'reason' => $reason));

        if ($saved) {
            // textname and url come from stored data; escape them before printing.
            echo $row['id']
                . htmlspecialchars($row['textname'], ENT_QUOTES, 'UTF-8')
                . htmlspecialchars($row['url'], ENT_QUOTES, 'UTF-8')
                . $label . "<br />";
        }
    }

    /**
     * Decode a gzip (RFC 1952) member.
     *
     * @param string $data Complete gzip stream.
     * @return string|false|null Decompressed data; null when the input is not
     *                           gzip (or has no body), false when the stream is
     *                           malformed or fails the size/CRC check.
     */
    public function gzdecode($data)
    {
        if (!is_string($data)) {
            return null;
        }

        $len = strlen($data);
        // 10-byte fixed header + 8-byte footer is the minimum; magic is 1f 8b.
        if ($len < 18 || substr($data, 0, 2) !== "\x1f\x8b") {
            return null; // Not GZIP format (see RFC 1952)
        }

        $method = ord($data[2]); // Compression method
        $flags = ord($data[3]);  // Flags
        if (($flags & 31) != $flags) {
            // Reserved bits are set -- not allowed by RFC 1952
            return null;
        }

        // Bytes 4-9 hold MTIME, XFL and OS; none of them affect decoding.
        $headerlen = 10;

        if ($flags & 4) {
            // FEXTRA: 2-byte little-endian length followed by that many bytes
            if ($len - $headerlen - 2 < 8) {
                return false;
            }
            $extralen = unpack('v', substr($data, $headerlen, 2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false;
            }
            $headerlen += 2 + $extralen;
        }

        if ($flags & 8) {
            // FNAME: zero-terminated original file name
            if ($len - $headerlen - 1 < 8) {
                return false;
            }
            $end = strpos($data, "\0", $headerlen);
            if ($end === false || $len - $end - 1 < 8) {
                return false;
            }
            $headerlen = $end + 1;
        }

        if ($flags & 16) {
            // FCOMMENT: zero-terminated comment
            if ($len - $headerlen - 1 < 8) {
                return false;
            }
            $end = strpos($data, "\0", $headerlen);
            if ($end === false || $len - $end - 1 < 8) {
                return false;
            }
            $headerlen = $end + 1;
        }

        if ($flags & 2) {
            // FHCRC: low 16 bits of the CRC32 over the header bytes so far
            if ($len - $headerlen - 2 < 8) {
                return false;
            }
            $calccrc = crc32(substr($data, 0, $headerlen)) & 0xffff;
            $headercrc = unpack('v', substr($data, $headerlen, 2));
            if ($headercrc[1] != $calccrc) {
                return false; // Bad header CRC
            }
            $headerlen += 2;
        }

        // Footer: CRC32 and size (mod 2^32) of the uncompressed data
        $datacrc = unpack('V', substr($data, -8, 4));
        $datacrc = $datacrc[1];
        $isize = unpack('V', substr($data, -4));
        $isize = $isize[1];

        $bodylen = $len - $headerlen - 8;
        if ($bodylen < 1) {
            return null;
        }

        if ($method !== 8) {
            return false; // deflate is the only supported compression method
        }

        $inflated = gzinflate(substr($data, $headerlen, $bodylen));
        if ($inflated === false) {
            return false;
        }

        if ($isize != strlen($inflated) || crc32($inflated) != $datacrc) {
            return false; // length or CRC does not match
        }

        return $inflated;
    }
}//end
?>
然后创建模板shenhe.html
<html> <!-- Shows the auto-review link only when $on is not NULL; the link opens Textpage/tongguo and passes $time as the "where" parameter. --> <if condition=" $on neq NULL " ><a href="{:U('/Textpage/tongguo', array('where'=>$time))}"><h4>自动审核</h4></a></if> </html>
最后创建模板tongguo.html
<html>
<script type="text/javascript">
    // Template variable "jump": the page reloads itself only while it equals '1'.
    var jump = "{$jump}";

    window.onload = function () {
        if (jump == '1') {
            // Reload after 60 seconds so the next batch gets reviewed.
            setTimeout(function () {
                location.reload();
            }, 60000);
        } else {
            alert('审核完毕');
            // location.href="{U:(Textpage/submit)}";
        }
    };
</script>
</html>
转载请注明转自:运达's blog 原文地址:http://www.yunda51.com/1190.html
博主这个用的是wordpress还是?
用不上,不过还是支持下
你们放假了吗?
@运达: 25号起开始放假
恩,比我们好呀,我们30号!