forked from ttodua/useful-php-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget-remote-url-content-data.php
149 lines (148 loc) · 9.02 KB
/
get-remote-url-content-data.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
<?php
/*
##############################################################################################
############### useful PHP cURL function for your library (TT's version) #####################
##############################################################################################
### echo get_remote_data("http://example.com/"); //GET request
### echo get_remote_data("http://example.com/", "var2=something&var3=blabla" ); //POST request
###
### * Automatically handles FOLLOWLOCATION problem;
### * Using 'replace_src'=>true, it fixes domain-relative urls (i.e.: src="./file.jpg" -----> src="http://example.com/file.jpg" )
### * Using 'schemeless'=>true, it converts urls in schemeless (i.e.: src="http://exampl.. -----> src="//exampl... )\
###### // source: https://github.com/tazotodua/useful-php-scripts ##########
###### // Get minified code from: http://protectpages.com/tools/php-minify.php ##########
###########################################################################################
*/
function get_remote_data($url, $post_paramtrs=false, $extra=array('schemeless'=>true, 'replace_src'=>true, 'return_array'=>false, "curl_opts"=>[]))
{
$c = curl_init();
curl_setopt($c, CURLOPT_URL, $url);
curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
//if parameters were passed to this function, then transform into POST method.. (if you need GET request, then simply change the passed URL)
if($post_paramtrs){ curl_setopt($c, CURLOPT_POST,TRUE); curl_setopt($c, CURLOPT_POSTFIELDS, (is_array($post_paramtrs)? http_build_query($post_paramtrs) : $post_paramtrs) ); }
curl_setopt($c, CURLOPT_SSL_VERIFYHOST,false);
curl_setopt($c, CURLOPT_SSL_VERIFYPEER,false);
curl_setopt($c, CURLOPT_COOKIE, 'CookieName1=Value;');
$headers[]= "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:76.0) Gecko/20100101 Firefox/76.0"; $headers[]= "Pragma: "; $headers[]= "Cache-Control: max-age=0";
if (!empty($post_paramtrs) && !is_array($post_paramtrs) && is_object(json_decode($post_paramtrs))){ $headers[]= 'Content-Type: application/json'; $headers[]= 'Content-Length: '.strlen($post_paramtrs); }
curl_setopt($c, CURLOPT_HTTPHEADER, $headers);
curl_setopt($c, CURLOPT_MAXREDIRS, 10);
//if SAFE_MODE or OPEN_BASEDIR is set,then FollowLocation cant be used.. so...
$follow_allowed= ( ini_get('open_basedir') || ini_get('safe_mode')) ? false:true; if ($follow_allowed){curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);}
curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 9);
curl_setopt($c, CURLOPT_REFERER, $url);
curl_setopt($c, CURLOPT_TIMEOUT, 60);
curl_setopt($c, CURLOPT_AUTOREFERER, true);
curl_setopt($c, CURLOPT_ENCODING, 'gzip,deflate');
curl_setopt($c, CURLOPT_HEADER, !empty($extra['return_array']));
//set extra options if passed
if(!empty($extra['curl_opts'])) foreach($extra['curl_opts'] as $key=>$value) curl_setopt($c, constant($key), $value);
$data = curl_exec($c);
if(!empty($extra['return_array'])) {
preg_match("/(.*?)\r\n\r\n((?!HTTP\/\d\.\d).*)/si",$data, $x); preg_match_all('/(.*?): (.*?)\r\n/i', trim('head_line: '.$x[1]), $headers_, PREG_SET_ORDER); foreach($headers_ as $each){ $header[$each[1]] = $each[2]; } $data=trim($x[2]);
}
$status=curl_getinfo($c); curl_close($c);
// if redirected, then get that redirected page
if($status['http_code']==301 || $status['http_code']==302) {
//if we FOLLOWLOCATION was not allowed, then re-get REDIRECTED URL
//p.s. WE dont need "else", because if FOLLOWLOCATION was allowed, then we wouldnt have come to this place, because 301 could already auto-followed by curl :)
if (!$follow_allowed){
//if REDIRECT URL is found in HEADER
if(empty($redirURL)){if(!empty($status['redirect_url'])){$redirURL=$status['redirect_url'];}}
//if REDIRECT URL is found in RESPONSE
if(empty($redirURL)){preg_match('/(Location:|URI:)(.*?)(\r|\n)/si', $data, $m); if (!empty($m[2])){ $redirURL=$m[2]; } }
//if REDIRECT URL is found in OUTPUT
if(empty($redirURL)){preg_match('/moved\s\<a(.*?)href\=\"(.*?)\"(.*?)here\<\/a\>/si',$data,$m); if (!empty($m[1])){ $redirURL=$m[1]; } }
//if URL found, then re-use this function again, for the found url
if(!empty($redirURL)){$t=debug_backtrace(); return call_user_func( $t[0]["function"], trim($redirURL), $post_paramtrs);}
}
}
// if not redirected,and nor "status 200" page, then error..
elseif ( $status['http_code'] != 200 ) { $data = "ERRORCODE22 with $url<br/><br/>Last status codes:".json_encode($status)."<br/><br/>Last data got:$data";}
//URLS correction
if(function_exists('url_corrections_for_content_HELPER')){ $data= url_corrections_for_content_HELPER($data, $status['url'], array('schemeless'=>!empty($extra['schemeless']), 'replace_src'=>!empty($extra['replace_src']), 'rawgit_replace'=>!empty($extra['rawgit_replace']) ) ); }
$answer = ( !empty($extra['return_array']) ? array('data'=>$data, 'header'=>$header, 'info'=>$status) : $data);
return $answer; } function url_corrections_for_content_HELPER( $content=false, $url=false, $extra_opts=array('schemeless'=>false, 'replace_src'=>false, 'rawgit_replace'=>false) ) {
$GLOBALS['rdgr']['schemeless'] =$extra_opts['schemeless'];
$GLOBALS['rdgr']['replace_src']=$extra_opts['replace_src'];
$GLOBALS['rdgr']['rawgit_replace']=$extra_opts['rawgit_replace'];
if($GLOBALS['rdgr']['schemeless'] || $GLOBALS['rdgr']['replace_src'] ) {
if($url) {
$GLOBALS['rdgr']['parsed_url'] = parse_url($url);
$GLOBALS['rdgr']['urlparts']['domain_X']= $GLOBALS['rdgr']['parsed_url']['scheme'].'://'.$GLOBALS['rdgr']['parsed_url']['host'];
$GLOBALS['rdgr']['urlparts']['path_X'] = stripslashes(dirname($GLOBALS['rdgr']['parsed_url']['path']).'/');
$GLOBALS['rdgr']['all_protocols']= array('adc','afp','amqp','bacnet','bittorrent','bootp','camel','dict','dns','dsnp','dhcp','ed2k','empp','finger','ftp','gnutella','gopher','http','https','imap','irc','isup','javascript','ldap','mime','msnp','map','modbus','mosh','mqtt','nntp','ntp','ntcip','openadr','pop3','radius','rdp','rlogin','rsync','rtp','rtsp','ssh','sisnapi','sip','smtp','snmp','soap','smb','ssdp','stun','tup','telnet','tcap','tftp','upnp','webdav','xmpp');
}
$GLOBALS['rdgr']['ext_array'] = array(
'src' => array('audio','embed','iframe','img','input','script','source','track','video'),
'srcset'=> array('source'),
'data' => array('object'),
'href' => array('link','area','a'),
'action'=> array('form')
//'param', 'applet' and 'base' tags are exclusion, because of a bit complex structure
);
$content= preg_replace_callback(
'/<(((?!<).)*?)>/si', //avoids unclosed & closing tags
function($matches_A){
$content_A = $matches_A[0];
$tagname = preg_match('/((.*?)(\s|$))/si', $matches_A[1], $n) ? $n[2] : "";
foreach($GLOBALS['rdgr']['ext_array'] as $key=>$value){
if(in_array($tagname,$value)){
preg_match('/ '.$key.'=(\'|\")/i', $content_A, $n);
if(!empty($n[1])){
$GLOBALS['rdgr']['aphostrope_type']= $n[1];
$content_A = preg_replace_callback(
'/( '.$key.'='.$GLOBALS['rdgr']['aphostrope_type'].')(.*?)('.$GLOBALS['rdgr']['aphostrope_type'].')/i',
function($matches_B){
$full_link = $matches_B[2];
//correction to files/urls
if(!empty($GLOBALS['rdgr']['replace_src']) ){
//if not schemeless url
if(substr($full_link, 0,2) != '//'){
$replace_src_allow=true;
//check if the link is a type of any special protocol
foreach($GLOBALS['rdgr']['all_protocols'] as $each_protocol){
//if protocol found - dont continue
if(substr($full_link, 0, strlen($each_protocol)+1) == $each_protocol.':'){
$replace_src_allow=false; break;
}
}
if($replace_src_allow){
$full_link = $GLOBALS['rdgr']['urlparts']['domain_X']. (str_replace('//','/', $GLOBALS['rdgr']['urlparts']['path_X'].$full_link) );
}
}
}
//replace http(s) with sheme-less urls
if(!empty($GLOBALS['rdgr']['schemeless'])){
$full_link=str_replace( array('https://','http://'), '//', $full_link);
}
//replace github mime
if(!empty($GLOBALS['rdgr']['rawgit_replace'])){
$full_link= str_replace('//raw.github'.'usercontent.com/','//rawgit.com/', $full_link);
}
$matches_B[2]=$full_link;
unset($matches_B[0]);
$content_B=''; foreach ($matches_B as $each){$content_B .= $each; }
return $content_B;
},
$content_A
);
}
}
}
return $content_A;
},
$content
);
$content= preg_replace_callback(
'/style="(.*?)background(\-image|)(.*?|)\:(.*?|)url\((\'|\"|)(.*?)(\'|\"|)\)/i',
function($matches_A){
$url = $matches_A[7];
$url = (substr($url,0,2)=='//' || substr($url,0,7)=='http://' || substr($url,0,8)=='https://' ? $url : '#');
return 'style="'.$matches_A[1].'background'.$matches_A[2].$matches_A[3].':'.$matches_A[4].'url('.$url.')'; //$matches_A[5] is url taged ,7 is url
},
$content
);
}
return $content;
}