最近在做ThinkPHP项目的时候需要写一个自动审核数据的功能:用cURL抓取目标页面,与库里的标题进行对比;审核页面每隔60秒会自动刷新一次,继续审核下一批(每批10条)数据!
代码如下:
首先创建:textpageAction.class.php(下面的方法都写在控制器类里面,代码末尾的 `}//end` 就是这个类的结束括号)
//检测字符串编码的方法,将编码$code转换为utf-8编码!
// Detects the encoding of $str (candidates: GB2312, GBK, UTF-8, ASCII) and
// converts it to UTF-8 with iconv().
//
// $str     raw bytes, typically a downloaded page body.
// Returns  the converted string, '' for an empty string, or false when the
//          input is not a string, the encoding cannot be detected, or iconv()
//          fails. Callers test the result loosely (== false).
function safeEncoding($str){
    // A failed download or a failed gunzip hands us false/null; there is
    // nothing to detect, so report failure instead of feeding it to mb_*.
    if (!is_string($str)) {
        return false;
    }
    if ($str === '') {
        return '';
    }

    $code = mb_detect_encoding($str, array('GB2312','GBK','UTF-8','ASCII'));
    if ($code === false) {
        // Detection failed (e.g. binary/compressed data). Previously false was
        // passed on to iconv() as the source charset.
        return false;
    }

    if ($code == "CP936") {
        // NOTE(review): text detected as CP936 (GBK) is returned unconverted,
        // exactly as the original code did, even though the stated goal is
        // UTF-8 output. Confirm whether this pass-through is intentional.
        return $str;
    }

    // iconv() returns false on an illegal byte sequence.
    return iconv($code, "UTF-8", $str);
}
///这个方法可以用来测试用
// Debug helper: downloads one hard-coded page, dumps the UTF-8 converted body,
// then prints every <h1> and whether the fixed car title contains its text.
// Terminates the request with exit().
function ceshi(){
    // include_once: a second include of the library in the same request would
    // redeclare its classes/functions and abort with a fatal error.
    include_once('simple_html_dom.php');

    $ch = curl_init();
    $timeout = 10;
    curl_setopt($ch, CURLOPT_URL, "http://bj.ohqly.com/1511/2014011548894568v0.html");
    // Browser-like User-Agent; some sites reject requests without one.
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // The original set the next option on an undefined $curl variable, so it
    // never reached this handle. CURLOPT_BINARYTRANSFER was set the same way;
    // it is dropped because it has no effect alongside CURLOPT_RETURNTRANSFER.
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    $html = curl_exec($ch);
    curl_close($ch);

    $html = $this->safeEncoding($html);
    var_dump($html);

    // str_get_html() does not return an object when it cannot parse the
    // input; calling ->find() on that result would be a fatal error.
    $dom = str_get_html($html);
    if (is_object($dom)) {
        foreach ($dom->find('h1') as $e) {
            echo $e->innertext . '<br>';
            $val = $e->innertext;
            dump(stripos("本田思域 2009款 1.8 VTI 自动豪华版", $val) !== false);
        }
    }
    exit();
}
///自动审核
// Automatic review action.
//
// Reads a "start,end" time range from the request parameter 'where', loads up
// to 10 textpage rows with status 2 inside that range, downloads each row's
// url and checks whether the page contains the row's textname:
//   - lieju.com   : looked up in <span> elements (raw inner HTML)
//   - baixing.com : looked up in <h2> elements (tags and spaces removed)
//   - other sites : looked up in <h1> elements (tags removed, trimmed)
// Each row is then saved with status 4 (passed) or status 3 plus a reason.
// 'jump' is assigned '1' when rows were found (the template reloads the page)
// and '0' otherwise. Finally the 'tongguo' template is rendered.
function tongguo(){
    // include_once: avoids a fatal redeclaration if the library is already loaded.
    include_once('simple_html_dom.php');

    // The range comes straight from the request, so it is never concatenated
    // into SQL; it is passed as values of an array condition instead.
    $rawRange = isset($_REQUEST['where']) ? $_REQUEST['where'] : '';
    $range = is_string($rawRange) ? explode(',', $rawRange) : array();

    $renwu = M('textpage');
    $list = array();
    if (count($range) >= 2) {
        $where = array();
        // BETWEEN is inclusive: same rows as time >= start and time <= end.
        $where['time'] = array('between', array($range[0], $range[1]));
        $where['status'] = 2;
        $list = $renwu->where($where)->limit(10)->select();
        if (!$list) {
            // select() yields a non-array on "no rows"/error; keep foreach safe.
            $list = array();
        }
    }
    $this->assign('jump', $list ? '1' : '0');

    // Sites this was written for (kept for reference only):
    // liebiao.com, 58.com, ohqly.com, ganji.com, lieju.com, favolist.com,
    // go007.com, gd8.com, fenlei168.com, kvov.com, ffjzw.com

    $timeout = 10;
    foreach ($list as $v) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $v['url']);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // The original set this on an undefined $curl variable, so gzip
        // decoding was never enabled on the real handle.
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        $body = curl_exec($ch);
        curl_close($ch);

        // Outcome of this row; filled in by exactly one branch below.
        $status = 3;
        $reason = '';
        $label = '';

        if ($body === false) {
            // Transport failure (connect timeout, DNS, ...).
            $reason = "超时";
            $label = "【文章不存在--未通过】";
        } else {
            $page = $this->safeEncoding($body);
            if ($page == false) {
                // Conversion failed: the body may still be gzip-compressed
                // (server compressed without announcing it). Inflate and retry.
                $inflated = $this->gzdecode($body);
                $page = $inflated ? $this->safeEncoding($inflated) : false;
            }

            if (!$page) {
                // Empty or undecodable body. The original could practically
                // never store this reason because its check was unreachable.
                $reason = "文章不存在!";
                $label = "【文章不存在--未通过】";
            } else {
                $dom = str_get_html($page);
                if (is_object($dom) === false) {
                    $reason = "链接不存在!";
                    $label = "【链接不存在--未通过】";
                } else {
                    // Pick the element to search and how to clean its text.
                    if (stripos($v['url'], 'lieju.com') !== false) {
                        $selector = 'span';
                        $mode = 'raw';
                    } else if (stripos($v['url'], 'baixing.com') !== false) {
                        $selector = 'h2';
                        $mode = 'nospace';
                    } else {
                        $selector = 'h1';
                        $mode = 'trim';
                    }

                    $tf = false;
                    foreach ($dom->find($selector) as $e) {
                        $val = $e->innertext;
                        if ($mode === 'nospace') {
                            $val = str_replace(' ', '', strip_tags($val));
                        } else if ($mode === 'trim') {
                            $val = trim(strip_tags($val));
                        }
                        if (stripos($val, $v['textname']) !== false) {
                            $tf = true;
                            break;
                        }
                    }

                    if ($tf) {
                        $status = 4;
                        $reason = "";
                        $label = "【通过】";
                    } else {
                        $reason = "标题错误!";
                        $label = "【标题错误--未通过】";
                    }
                }
            }
        }

        // Single write per row; the line is echoed only when save() reports success.
        $saved = $renwu->where(array('id' => $v['id']))
                       ->save(array('status' => $status, 'reason' => $reason));
        if ($saved) {
            echo $v['id'] .$v['textname'] .$v['url'] .$label ."<br />";
        }
    }
    $this->display('tongguo');
}
//gzip压缩编码的解决方法
// Decodes a single gzip member (RFC 1952) held in a string.
//
// $data    raw gzip bytes.
// Returns  the decompressed string on success;
//          null when $data is not gzip at all (not a string, too short, wrong
//          magic bytes, reserved flag bits set, no room for a body);
//          false when the gzip structure is present but invalid (truncated
//          header fields, bad header CRC, unknown method, inflate failure,
//          size or CRC32 mismatch).
function gzdecode($data) {
    if (!is_string($data)) {
        return null;
    }
    $len = strlen($data);
    // 10-byte fixed header + 8-byte trailer is the minimum framing.
    if ($len < 18 || strcmp(substr($data, 0, 2), "\x1f\x8b")) {
        return null; // Not GZIP format (See RFC 1952)
    }
    $method = ord($data[2]); // Compression method
    $flags = ord($data[3]);  // Flags

    // Bits 5-7 are reserved and must be zero. The original wrote
    // "$flags & 31 != $flags", which PHP parses as "$flags & (31 != $flags)"
    // and therefore tested bit 0 instead.
    if ($flags & 0xE0) {
        return null;
    }

    // Bytes 4-9 (mtime, xfl, os) are not needed for decoding.
    $headerlen = 10;

    if ($flags & 4) {
        // FEXTRA: 2-byte little-endian length right after the fixed header,
        // followed by that many bytes of extra data.
        if ($len - $headerlen - 2 < 8) {
            return false; // Invalid format
        }
        $extralen = unpack("v", substr($data, $headerlen, 2));
        $extralen = $extralen[1];
        if ($len - $headerlen - 2 - $extralen < 8) {
            return false; // Invalid format
        }
        $headerlen += 2 + $extralen;
    }

    if ($flags & 8) {
        // FNAME: zero-terminated file name starting at the current header end.
        $end = strpos($data, "\0", $headerlen);
        if ($end === false || $len - $end - 1 < 8) {
            return false; // Invalid format
        }
        $headerlen = $end + 1;
    }

    if ($flags & 16) {
        // FCOMMENT: zero-terminated comment starting at the current header end.
        $end = strpos($data, "\0", $headerlen);
        if ($end === false || $len - $end - 1 < 8) {
            return false; // Invalid header format
        }
        $headerlen = $end + 1;
    }

    if ($flags & 2) {
        // FHCRC (bit 1): low 16 bits of the CRC32 of the header so far.
        // Bit 0 is FTEXT, a hint only, and adds no header bytes.
        if ($len - $headerlen - 2 < 8) {
            return false; // Invalid format
        }
        $calccrc = crc32(substr($data, 0, $headerlen)) & 0xffff;
        $headercrc = unpack("v", substr($data, $headerlen, 2));
        $headercrc = $headercrc[1];
        if ($headercrc != $calccrc) {
            return false; // Bad header CRC
        }
        $headerlen += 2;
    }

    // Trailer: CRC32 of the uncompressed data, then its length.
    $datacrc = unpack("V", substr($data, -8, 4));
    $datacrc = $datacrc[1];
    $isize = unpack("V", substr($data, -4));
    $isize = $isize[1];

    $bodylen = $len - $headerlen - 8;
    if ($bodylen < 1) {
        return null;
    }
    if ($method != 8) {
        // Deflate (8) is the only supported compression method.
        return false;
    }

    $inflated = gzinflate(substr($data, $headerlen, $bodylen));
    if ($inflated === false) {
        return false; // Corrupt deflate stream
    }

    // Verify decompressed size and CRC32 against the trailer.
    if ($isize != strlen($inflated) || crc32($inflated) != $datacrc) {
        return false;
    }
    return $inflated;
}
}//end
?>
然后创建模板shenhe.html
<html>
<!-- Shows the auto-review link only when the template variable on is not NULL.
     The link targets /Textpage/tongguo and passes the template variable time
     as the "where" request parameter. -->
<if condition=" $on neq NULL " ><a href="{:U('/Textpage/tongguo',
array('where'=>$time))}"><h4>自动审核</h4></a></if>
</html>
最后创建模板tongguo.html
<html>
<script type="text/javascript">
// Filled in by the template engine with the value assigned to jump.
var jump="{$jump}";
window.onload = function () {
    if (jump != '1') {
        // Any value other than '1' means the review is finished.
        alert('审核完毕');
        //location.href="{U:(Textpage/submit)}";
        return;
    }
    // Reload this page after 60 seconds.
    var reloadPage = function () {
        location.reload();
    };
    setTimeout(reloadPage, 60000);
};
</script>
</html>
![]()
转载请注明转自:运达's blog 原文地址:http://www.yunda51.com/1190.html
博主这个用的是wordpress还是?
用不上,不过还是支持下
你们放假了吗?
@运达: 25号起开始放假
恩,比我们好呀,我们30号!