-
Notifications
You must be signed in to change notification settings - Fork 0
/
link_krake.pl
139 lines (112 loc) · 3.34 KB
/
link_krake.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env perl
#
package link_krake;
use strict;
use warnings;
use Data::Dumper;
use URI;
use URL::Normalize;
use HTML::SimpleLinkExtor;
use LWP::UserAgent;
use DBI;
my $ua = LWP::UserAgent->new(agent => "SLK - Schöne Link Krake");
$ua->timeout( 10 );
my $dbh = DBI->connect( "DBI:Pg:host=localhost;db=linkkrake",
"username", "password", { AutoCommit => 1, PrintError => 0, RaiseError => 0 } );
my $MAXCHILDS = 20;
my $childs = 0;
my $sth = $dbh->prepare( "SELECT id, url FROM URLs WHERE scanned != 1 ORDER BY RANDOM() LIMIT 200" );
my $rv = $sth->execute;
while ( $rv ) {
my $maxchilds;
if( $rv < $MAXCHILDS) {
$maxchilds = $rv;
} else {
$maxchilds = $MAXCHILDS;
}
#print "Max: $maxchilds\n";
for ( 1 .. $rv ) {
if ( $childs == $maxchilds ) {
my $pid = wait;
$childs--;
}
my $row = $sth->fetchrow_hashref;
my $pid = fork();
if ( $pid ) {
# Parent
$childs++;
}
elsif ( $pid == 0 ) {
# Child
query_for_links( $dbh, $row );
exit 0;
}
}
$sth = $dbh->prepare( "SELECT id, url FROM URLs WHERE scanned != 1 ORDER BY RANDOM() LIMIT 200" );
$rv = $sth->execute;
}
sub query_for_links {
my ( $dbh, $row ) = @_;
my $sth;
my $url = $row->{url};
my $child_dbh = $dbh->clone;
$dbh->{InactiveDestroy} = 1;
$dbh = undef;
local $SIG{ALRM} = sub { die "Connection Timeout" };
alarm( 30 );
my $head_response = $ua->head( $url );
alarm( 0 );
#if ( $url =~ /([^\s]+(\.(?i)(pdf|ico|svg|css|js|javascript|java|jpg|jpeg|png|gif|bmp|avi|mpeg|mpg|mp4|mp3|mp2|xbm|flash))$)/ )
if ( $head_response->is_success && $head_response->content_type ne 'text/html' ) {
$sth = $child_dbh->prepare( "UPDATE URLs SET scanned = 1, last_scan = NOW() WHERE id = ?" );
$sth->execute( $row->{id} );
return;
}
if ( !$head_response->is_success ) {
$sth = $child_dbh->prepare(
"UPDATE URLs SET scanned = 1, last_scan = NOW(), errors = 1, last_error = NOW() WHERE id = ?" );
$sth->execute( $row->{id} );
print "Error: $url\n";
return;
}
alarm( 30 ); # Define a hard timeout for crawling
#print "\n$url";
my $response = $ua->get( $url ) or next; #die "Could not get '$url'";
alarm( 0 );
unless ( $response->is_success ) {
#die $response->status_line;
alarm( 0 );
return;
}
my $html = $response->decoded_content;
my $extractor = HTML::SimpleLinkExtor->new;
$extractor->parse( $html );
my @links = $extractor->links;
unless ( @links ) {
print "No links found for $url\n";
$sth = $child_dbh->prepare( "UPDATE URLs SET scanned = 1, last_scan = NOW() WHERE id = ?" );
$sth->execute( $row->{id} );
return;
#exit;
}
else {
for my $link ( sort @links ) {
$link = URI->new_abs( $link, $url )
unless ( URI->new( $link )->scheme );
$link =~ s/(.*)#(.*)/$1/;
$link = URI->new( $link )->canonical;
if ( URI->new( $link )->scheme =~ /^https?/ ) {
my $normalizer = URL::Normalize->new( url => $link );
$normalizer->do_all;
$link = $normalizer->get_url;
}
next if URI->new( $link )->eq($url);
#print "$link\n" if (URI->new($link)->scheme =~ /^https?/);
$sth = $child_dbh->prepare( "INSERT INTO URLs (url, scanned) VALUES(?, 0)" );
$sth->execute( $link ) if ( URI->new( $link )->scheme =~ /^https?/ );
}
print $url . " - " . @links . "\n";
}
$sth = $child_dbh->prepare( "UPDATE URLs SET scanned = 1, last_scan = NOW() WHERE id = ?" );
$sth->execute( $row->{id} );
}