idnes-foto-old1
author lace <>
Wed, 14 Jul 2010 11:13:07 +0000 (11:13 +0000)
committer lace <>
Wed, 14 Jul 2010 11:13:07 +0000 (11:13 +0000)
public_html/cgi-bin/idnes-foto

index eaa15c9..dfdbd96 100755 (executable)
@@ -2,6 +2,7 @@
 use strict;
 use warnings;
 require LWP::Simple;
 use strict;
 use warnings;
 require LWP::Simple;
+require HTTP::Date;
 require URI::Escape;
 
 $|=1;
 require URI::Escape;
 
 $|=1;
@@ -18,8 +19,13 @@ else {
        }
 
 if ($ENV{"GATEWAY_INTERFACE"}) {
        }
 
 if ($ENV{"GATEWAY_INTERFACE"}) {
+       my $future=HTTP::Date::time2str(2000000000);
+       my $past  =HTTP::Date::time2str(1000000000);
        print <<"EOH";
 Content-type: text/html; charset=windows-1250
        print <<"EOH";
 Content-type: text/html; charset=windows-1250
+Cache-Control: public
+Expires: $future
+Last-Modified: $past
 
 EOH
        }
 
 EOH
        }
@@ -33,7 +39,7 @@ print <<"EOH";
        <input type="submit">
 </form>
 EOH
        <input type="submit">
 </form>
 EOH
-$BASE=~m{^http://\w+\Q.idnes.cz/foto.asp?\E(?:r=\w+&c=\w+|c=\w+&r=\w+)$} or $BASE="";
+$BASE=~m{^http://\w+\Q.idnes.cz/foto.asp?\E(?:r=[-\w]+&c=[-\w]+|c=\w+&r=\w+)$} or $BASE="";
 if ($BASE) {
        my $pageno=1;
        my %seen;
 if ($BASE) {
        my $pageno=1;
        my %seen;
@@ -42,25 +48,30 @@ PAGES:      for (;;) {
                my $pageurl=$BASE.'&strana='.$pageno;
                my $page=LWP::Simple::get($pageurl) or die $pageurl;
                my $did=0;
                my $pageurl=$BASE.'&strana='.$pageno;
                my $page=LWP::Simple::get($pageurl) or die $pageurl;
                my $did=0;
-               while ($page=~m{<img src="(http://(?:\Qi.idnes.cz\E/\d{2}/\d{3}|\Qimgs.idnes.cz\E/\w+))(/midi)?/([-\w]+.jpg)" }gi) {
+               while ($page=~m{<img src="http://(?:\Qi.idnes.cz\E/\d{2}/\d{3}|\Qimgs.idnes.cz\E/\w+)(?:/\w+)?/([-.\w]+.jpg)" }gi) {
                        $did=1;
                        $did=1;
-                       my($start,$mid,$base)=($1,$2,$3);
+                       my($base)=($1);
                        last PAGES if $seen{$base}++;
                        last PAGES if $seen{$base}++;
-                       # FIXME: Sometimes "/maxi" does not exist.
-                       $mid&&="/maxi";
-                       $base=~s/_1M.JPG$/_V.JPG/;
-                       my $maxi=$start.($mid||"")."/".$base;
+                       my $infourl=$BASE.'&styl=zoom&foto='.$base;
+                       my $info=LWP::Simple::get($infourl) or die $infourl;
+                       $info=~m{<img src="(http://(?:\Qi.idnes.cz\E/\d{2}/\d{3}|\Qimgs.idnes.cz\E/\w+)(?:/\w+)?/[-.\w]+.jpg)" }i
+                                       or die "No image found: $infourl";
+                       my $img_src=$1;
                        print <<"EOH";
 <hr>
                        print <<"EOH";
 <hr>
-<img src="$maxi" border="0">
+<img src="$img_src" border="0">
 EOH
 EOH
-                       my $infourl=$BASE.'&styl=zoom&foto='.$base;
-                       my $info=LWP::Simple::get($infourl) or die $infourl;
-                       $info=~m{<p>[^<]*</p>} or die "No text found: $infourl";
-                       my $p_text=$&;
-                       print <<"EOH" if !$seen_p_text{$p_text}++;
-$p_text
+                       my $text_last=keys(%seen_p_text);;
+                       for my $text (
+                                       $info=~m{<p>([^<>]*)</p>},
+                                       $info=~m{<div class="text"><!--google_ad_section_start--><h4>([^<>]*)</h4><p>([^<>]*)<!--google_ad_section_end--></p><p>Autor:\s+(.*?)(?:, <a target="_blank" href="http://www.idnes.cz">iDNES.cz</a>)?</p></div>}s,
+                                       ) {
+                               next if !$text;
+                               print <<"EOH" if !$seen_p_text{$text}++;
+<p>$text</p>
 EOH
 EOH
+                               }
+                       warn "No text found: $infourl" if $text_last==keys(%seen_p_text);
                        }
                die $pageurl if !$did;
                $pageno++;
                        }
                die $pageurl if !$did;
                $pageno++;