current
[nethome.git] / public_html / cgi-bin / idnes-foto
1 #! /usr/bin/perl
2 use strict;
3 use warnings;
4 require LWP::Simple;
5 require HTTP::Date;
6 require URI::Escape;
7
8 $|=1;
9 my $BASE;
10 if (exists $ENV{"QUERY_STRING"}) {
11         $BASE=($ENV{"QUERY_STRING"}=~m{^url=([^&]*)})[0];
12         $BASE=URI::Escape::uri_unescape($BASE);
13         0==@ARGV or die "ARGV count != 0";
14         $BASE||="";
15         }
16 else {
17         1==@ARGV or die "ARGV count != 1";
18         $BASE=$ARGV[0];
19         }
20
21 # Somehow Perl modules started reencoding windows-1250 -> utf-8.
22
23 if ($ENV{"GATEWAY_INTERFACE"}) {
24         my $future=HTTP::Date::time2str(2000000000);
25         my $past  =HTTP::Date::time2str(1000000000);
26         print <<"EOH";
27 Content-type: text/html; charset=utf-8
28 Cache-Control: public
29 Expires: $future
30 Last-Modified: $past
31
32 EOH
33         }
34 print <<"EOH";
35 <html><head>
36 <title>iDNES foto@{[ (!$BASE ? "" : ": $BASE") ]}</title>
37 <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
38 </head><body>
39 <form action="idnes-foto" method="get">
40         <input type="text" name="url" size="50" value="$BASE">
41         <input type="submit">
42 </form>
43 EOH
44 $BASE=~m{^http://\w+\Q.idnes.cz/foto.asp?\E(?:r=[-_\w]+&c=[-_\w]+|c=[-_\w]+&r=[-_\w]+)$} or $BASE="";
45 if ($BASE) {
46         my $pageno=1;
47         my %seen;
48         my %seen_p_text;
49 PAGES:  for (;;) {
50                 my $pageurl=$BASE.'&strana='.$pageno;
51                 my $page=LWP::Simple::get($pageurl) or die $pageurl;
52                 my $did=0;
53                 while ($page=~m{<img src="http://(?:\Qi.idnes.cz\E/\d{2}/\d{3}|\Qimgs.idnes.cz\E/\w+)(?:/\w+)?/([-.\w]+.jpg)" }gi) {
54                         $did=1;
55                         my($base)=($1);
56                         last PAGES if $seen{$base}++;
57                         my $infourl=$BASE.'&styl=zoom&foto='.$base;
58                         my $info=LWP::Simple::get($infourl) or die $infourl;
59                         $info=~m{<img\s+id="fotka"\s+src="([^<>"]+)" }is
60                                         or die "No image found: $infourl";
61                         my $img_src=$1;
62                         print <<"EOH";
63 <hr>
64 <img src="$img_src" border="0">
65 EOH
66                         my $hit;
67                         for my $text (
68                                         $info=~m{<span>(.*?)</span>}s,
69                                         $info=~m{<p>([^<>]*)</p>},
70                                         $info=~m{<div class="text"><!--google_ad_section_start--><h4>([^<>]*)</h4><p>([^<>]*)<!--google_ad_section_end--></p><p>Autor:\s+(.*?)(?:, <a target="_blank" href="http://www.idnes.cz">iDNES.cz</a>)?</p></div>}s,
71                                         ) {
72                                 next if !$text;
73                                 $text=~s/^\s+//s;
74                                 $text=~s/\s+$//s;
75                                 next if !$text;
76                                 $hit++;
77                                 print <<"EOH" if !$seen_p_text{$text}++;
78 <p>$text</p>
79 EOH
80                                 }
81                         warn "No text found: $infourl" if !$hit;
82                         }
83                 die $pageurl if !$did;
84                 $pageno++;
85                 }
86         }
87 print <<"EOH";
88 <hr>
89 <p>EOF</p>
90 </body></html>
91 EOH