current
author lace <>
Wed, 14 Jul 2010 11:13:17 +0000 (11:13 +0000)
committer lace <>
Wed, 14 Jul 2010 11:13:17 +0000 (11:13 +0000)
public_html/cgi-bin/idnes-foto

index dfdbd96..2cd8e73 100755 (executable)
@@ -18,11 +18,13 @@ else {
        $BASE=$ARGV[0];
        }
 
+# Somehow the Perl modules started re-encoding windows-1250 as utf-8, so the output is now declared as utf-8.
+
 if ($ENV{"GATEWAY_INTERFACE"}) {
        my $future=HTTP::Date::time2str(2000000000);
        my $past  =HTTP::Date::time2str(1000000000);
        print <<"EOH";
-Content-type: text/html; charset=windows-1250
+Content-type: text/html; charset=utf-8
 Cache-Control: public
 Expires: $future
 Last-Modified: $past
@@ -32,14 +34,14 @@ EOH
 print <<"EOH";
 <html><head>
 <title>iDNES foto@{[ (!$BASE ? "" : ": $BASE") ]}</title>
-<meta http-equiv="Content-Type" content="text/html; charset=windows-1250">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 </head><body>
 <form action="idnes-foto" method="get">
        <input type="text" name="url" size="50" value="$BASE">
        <input type="submit">
 </form>
 EOH
-$BASE=~m{^http://\w+\Q.idnes.cz/foto.asp?\E(?:r=[-\w]+&c=[-\w]+|c=\w+&r=\w+)$} or $BASE="";
+$BASE=~m{^http://\w+\Q.idnes.cz/foto.asp?\E(?:r=[-_\w]+&c=[-_\w]+|c=[-_\w]+&r=[-_\w]+)$} or $BASE="";
 if ($BASE) {
        my $pageno=1;
        my %seen;
@@ -54,24 +56,29 @@ PAGES:      for (;;) {
                        last PAGES if $seen{$base}++;
                        my $infourl=$BASE.'&styl=zoom&foto='.$base;
                        my $info=LWP::Simple::get($infourl) or die $infourl;
-                       $info=~m{<img src="(http://(?:\Qi.idnes.cz\E/\d{2}/\d{3}|\Qimgs.idnes.cz\E/\w+)(?:/\w+)?/[-.\w]+.jpg)" }i
+                       $info=~m{<img\s+id="fotka"\s+src="([^<>"]+)" }is
                                        or die "No image found: $infourl";
                        my $img_src=$1;
                        print <<"EOH";
 <hr>
 <img src="$img_src" border="0">
 EOH
-                       my $text_last=keys(%seen_p_text);;
+                       my $hit;
                        for my $text (
+                                       $info=~m{<span>(.*?)</span>}s,
                                        $info=~m{<p>([^<>]*)</p>},
                                        $info=~m{<div class="text"><!--google_ad_section_start--><h4>([^<>]*)</h4><p>([^<>]*)<!--google_ad_section_end--></p><p>Autor:\s+(.*?)(?:, <a target="_blank" href="http://www.idnes.cz">iDNES.cz</a>)?</p></div>}s,
                                        ) {
                                next if !$text;
+                               $text=~s/^\s+//s;
+                               $text=~s/\s+$//s;
+                               next if !$text;
+                               $hit++;
                                print <<"EOH" if !$seen_p_text{$text}++;
 <p>$text</p>
 EOH
                                }
-                       warn "No text found: $infourl" if $text_last==keys(%seen_p_text);
+                       warn "No text found: $infourl" if !$hit;
                        }
                die $pageurl if !$did;
                $pageno++;