updates: track new United Media change (two months late )-8 ), use current user's HOME dir
#! /usr/bin/perl
# multi-source comic fetcher
# assumes a sane (eg. non-DOS) environment
# outputs filenames to STDOUT
# possible Y2K problems are marked with the string Y2K - it's
# difficult to know what the dilbert folks are going to do with their
# date format when we tick over.
use Time::Local;
use strict;
no strict 'refs'; # we need symbolic refs for sub-names
use vars qw(@toons $basedir $exitcode $verby @months);
# verbosity should be set from the comand line!
$verby = 0; # 0=quiet(ish), 2=noisy
# basedir contains a directory for each cartoon fetched
# the HOME env var will be exported by cron automatically...
die "HOME is where the blackhole is?" unless $ENV{'HOME'};
$basedir = "$ENV{'HOME'}/public_html/cartoons";
# a list of "source:name". Each source must be defined as a sub FetchSource
@toons = ('UM:dilbert', 'UF:userfriendly');
# bits set: 1=fetch errors, 2=store errors, 4=config error
$exitcode = 0;
$| = 1; $_ = select STDERR; $| = 1; select $_; # autoflush
@months = ('jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec');
# main loop calls FetchBLAH for each strip
# if @ARGV contains anything, then only named cartoons are fetched
unless (-d $basedir or (mkdir $basedir, 0755)) {
print STDERR "mkdir: $basedir: $!\n";
exit 2;
}
foreach (@ARGV) {
print STDERR "Comic '$_' is not defined\n"
unless (join '::', @toons, '') =~ m@\:$_\:\:@i or lc($_) eq 'index';
}
foreach (@toons) {
my ($source, $name) = /^([^:]+):([^:]+)$/;
if (@ARGV and not grep /^$name$/i, @ARGV) {
print STDERR "Skipping $name\n";
next;
}
if (not defined $source or not defined $name) {
print STDERR "Cartoon '$_' doesn't fit 'source:name'\n";
$exitcode |= 4;
next;
}
my $dir = "$basedir/$name";
unless (-d $dir or (mkdir $dir, 0755)) {
print STDERR "mkdir: $dir: $!\n";
$exitcode |= 2;
next;
}
&{"Fetch$source"} ($name, $dir)
unless grep /^index$/i, @ARGV;
# now update the index files
BuildIndex($name, $dir, $source);
}
exit $exitcode;
################ ################ ################
# fetchers for comics
#
sub FetchUM # args = (comic-name, target-directory)
# Fetch named comic from www.unitedmedia.com
# Dies sensibly on fetch failures
# Doesn't fetch things it already has
# Warns if the monthly archive has < 25 entries
{
# index page = http://www.unitedmedia.org/comics/COMIC/archive/
# individual pages = <option value="/comics/COMIC/archive/cal-\d{2}.html">Jan. 01 (was archive/COMICyymmdd.html)
# pictures = /comics/COMIC/archive/images/COMIC\d+.gif
# (pictures have \d+ usually contains yymmdd but sometimes doesn't)
my ($comic, $dir) = (shift, shift);
my $html = FetchURL("http://www.unitedmedia.com/comics/$comic/archive/");
unless (defined $html) {
print STDERR "Fetch failed for United Media: $comic archive\n";
$exitcode |= 1;
return;
}
my %days;
while ($html =~ m@=\s*"(/comics/$comic/archive/$comic-(\d+)\.html)"\s*>@gi) {
$days{$2} = $1;
}
if (scalar keys %days < 25) {
print STDERR "Fetch warning for United Media: $comic archive has only ",
scalar keys %days, " entries\n";
#print STDERR "dump of days >>%days<<\n\n";
$exitcode |= 1;
}
my $daypage;
my $daycode;
foreach $daycode (sort keys %days) {
$daypage = $days{$daycode};
if ($daycode !~ /^(\d{4})(\d{2})(\d{2})$/) {
print STDERR "Fetch warning United Media: can't parse date code '$daycode'\n";
$exitcode |= 1;
next;
}
#TODO .. check it's a valid daycode!
my $day = $daycode;
# do we have it already?
next if -f "$dir/$day.gif";
# fetch the page for the day
$html = FetchURL("http://www.unitedmedia.com$daypage");
unless (defined $html) {
print STDERR "Fetch failed for United Media: $comic: daypage $day\n";
$exitcode |= 1;
next;
}
# extract a (list of) gifs for the day - should be just one
my @gifs = $html =~ m@<img\s+src\s*=\s*"(/comics/$comic/archive/images/$comic\d+\.gif)"@gi;
FetchGifs("$dir/$day", "United Media: $comic: $day", "http://www.unitedmedia.com", @gifs);
}
}
sub FetchUF # args = (comic-name, target-directory)
# Fetch named comic from www.userfriendly.com
# Dies sensibly on fetch failures
# Doesn't fetch things it already has
{
# index page = http://www.userfriendly.org/cartoons/archives/
# individual pages = /cartoons/archive/YYmmm/YYYYMMDD.html
# pictures = /cartoons/YYmmm/\S+.gif
# (later pictures have \S+ == YYYYMMDD but earlier ones look random, newer ones like uf000919.gif)
# Starts with today and goes backwards making URLs for the last $days days
# May or may not be Y2K compliant depending on what userfriendly.org do!
# Likely to find that "today's" cartoon is unavailable or out of date
# if we're in a different timezone
my $days = 333; # checks the last $days days files exist
my ($comic, $dir) = (shift, shift);
my ($month, $day);
die "Can only fetch Userfriendly" unless lc($comic) eq "userfriendly";
# timezone hours offset isn't really needed unless you want to
# fetch as soon as the new one is available
my $now = time() + 3600 * 0;
for ( ; $days >= 0; $days--) {
@_ = localtime($now - $days * 3600 * 24);
$_[5] += 1900;
$month = substr($_[5],2,2) . $months [$_[4]];
$day = $_[5] . two($_[4]+1) . two($_[3]);
if (not -f "$dir/$day.gif") {
print STDERR "Userfriendly '$day' ...\n" if $verby > 0;
my $html = FetchURL("http://www.userfriendly.org/cartoons/archives/$month/$day.html");
unless (defined $html) {
print STDERR "Fetch failed for Userfriendly: $day's page\n";
$exitcode |= 1;
return;
}
print STDERR "... index ...\n" if $verby > 0;
my @gifs = $html =~ m@<img\s+[^>]*src\s*=\s*"(?:http://www.userfriendly.org)?(/cartoons/archives/\d{2}[abcdefgjlmnoprstuvy]{3}/\w+.gif)"@gi;
FetchGifs("$dir/$day", "Userfriendly: $day", "http://www.userfriendly.org", @gifs);
print STDERR "... and picture\n\n" if $verby > 0;
} else { print STDERR "Userfriendly '$day' already exists\n" if $verby > 0 }
}
}
sub two # arg = 0 <= number < 100
{
return $_[0] if $_[0] > 9;
return "0$_[0]";
}
################ ################ ################
# index generator
#
sub BuildIndex # args = (name, directory, cartoon-source)
# update html files for the directory
{
my ($name, $dir, $src) = (shift, shift, shift);
my $chunksize = 14; # days' cartoons per index-chunk
# rather inefficient, but we bin all previous index files at the moment
foreach (glob "$dir/*.html") { unlink }
# for glob of *.gif ... guess dates and if any gifs -m after the -m of
# that (week, month)'s index then remake
# find the oldest cartoon
my $oldest = time();
my @gifs = glob "$dir/*.gif"; # used again later
print STDERR "Indexing $dir: ",(scalar @gifs)," items\n" if $verby;
foreach (@gifs) {
s@^$dir/@@i;
die "Don't understand date format of $dir/$_"
unless /^(\d{2}|\d{4})(\d{2})(\d{2})(-\d+)?.gif$/i;
my ($yr, $mn, $dy) = ($1, ($2)-1, $3); # $4 is just for multiple gifs on one day
$yr += 1900 if $yr < 100; # Y2K problem!
my $then = timelocal(0, 0, 12, $dy, $mn, $yr);
die "Time overflow!" if $then == -1;
$oldest = $then unless $oldest < $then;
}
# work back from the oldest until we hit a Sunday (just for convenience)
while ((localtime($oldest))[6] != 0) { $oldest -= 24 * 3600 }
print STDERR " Starting date: ",scalar localtime($oldest),"\n" if $verby;
# Now work forwards making a indices for each $chunksize days and noting holes
# Also keep an index of chunks in index.html
die "Can't make index $dir/index.html"
unless open IDX, ">$dir/index.html";
print IDX "<html><head>
<meta http-equiv=\"pragma\" content=\"no-cache\">
<title>Index for \u$name</title></head>
<body><a href=\"#bot\">Jump to the bottom</a>
<table border=3 cellpadding=5 cellspacing=0>\n";
my $then;
for ($then=$oldest; $then <= time()+24*3600; $then += $chunksize * 24*3600) {
my ($i, $remake, @toons) = ();
my $present = 0;
# 0 = none found, 1 = early ones missing but otherwise OK,
# 2 = all present so far, 3 = holes exist
for ($i = 0; $i < $chunksize; $i++) {
my $now = $then + $i * 3600 * 24;
@_ = localtime($now);
$_[5] += 1900;
my $day = $_[5] . two($_[4]+1) . two($_[3]);
my $noCEday = substr($day, 2);
my @daystoons = grep /^($day|$noCEday)(-\d+)?.gif$/i, @gifs;
if (@daystoons) {
push @toons, [$now, @daystoons];
if ($i == 0) { $present = 2 }
elsif ($present == 0) { $present = 1 }
} else {
# missing one day
push @toons, [$now];
$present = 3 unless $present == 0; #
}
}
# write the index file for this chunk
my $start;
($_, $start) = BuildIndex_date($then);
my $min_fetcheddays = 365000; # Y2K (um, Y1M) 8-)
if ($present > 0) {
die "Can't make index $dir/index$start.html"
unless open CHKIDX, ">$dir/index$start.html";
my $nextfile = (BuildIndex_date($then + $chunksize * 3600 * 24))[1];
print CHKIDX "<html><head>
<meta http-equiv=\"pragma\" content=\"no-cache\">
<title>Index for \u$name : $_</title></head>
<body bgcolor=#ffffff><a name=\"top\"></a>";
print CHKIDX "<a href=\"index$nextfile.html#bot\">Next</a>\n"
unless $then + $chunksize * 3600*24 > time();
print CHKIDX "<a href=\"index.html\">Return to index</a>
<table border=3 cellpadding=50 cellspacing=0>\n";
foreach (reverse @toons) {
my $when = shift @$_;
if ($when < time() or @$_ != 0) {
$when = scalar localtime($when);
$when =~ s/\s*(\d{2}:){2}\d{2}//;
print CHKIDX "<tr><td>$when<br>";
my $pic;
if (@$_) {
foreach $pic (@$_) {
my $age = -M "$dir/$pic";
$min_fetcheddays = $age unless $min_fetcheddays < $age;
print CHKIDX "<img src=\"$pic\"> ";
}
} else {
print CHKIDX "<em>Missing</em>";
}
print CHKIDX "\n<a name=\"bot\"></a>" if $_ eq $toons[0];
print CHKIDX "</td></tr>\n";
}
}
print CHKIDX "</table><a href=\"#top\">Up to top</a>\n</body></html>\n";
close CHKIDX;
}
print STDERR " Chunk ",(scalar localtime($then))," : present = $present\n" if $verby;
if ($present == 0) { print IDX "<tr><td>$_</td><td>" }
else { print IDX "<tr><td><a href=\"index$start.html#bot\">$_</a></td><td>" }
print IDX "State: ", ('<b>none</b>', 'first ones missing', 'OK', '<b>holes!</b>')[$present];
$min_fetcheddays = int ($min_fetcheddays*10) / 10;
if ($present == 0) {
print IDX "</td></tr>\n";
} else {
print IDX "</td><td>Newest fetched $min_fetcheddays day".($min_fetcheddays==1?'':'s')." ago";
print IDX "<img src=\"../new.gif\" alt=\"[new!]\">" if $min_fetcheddays < 3;
print IDX "</td></tr>\n";
}
}
print IDX "</table><a href=\"..\">Up to index</a><a name=\"bot\"</a></body></html>\n";
close IDX;
}
sub BuildIndex_date # arg = (time-since-epoch)
# returns human readable and compacted versions
{
my $then = shift;
@_ = localtime($then);
$_[5] += 1900;
my $start = "$_[3]$months[$_[4]]$_[5]";
my $spaced = "$start";
$spaced =~ s/^(\d+)([a-z]+)(\d+)$/$1 \u$2 $3/i;
return ($spaced, $start);
}
################ ################ ################
# implementation subs
#
sub DumpFile # args = (filename, binary-data)
# prints filename on success, warns and set exitcode on failure
{
my ($fn, $data) = (shift, shift);
if (not open F, ">$fn") {
print STDERR "open: $fn: $!\n";
$exitcode |= 2;
return;
}
if (not print F $data) {
print STDERR "print: $fn: unspecified failure (disk full?)\n";
$exitcode |= 2;
} else {
print "$fn\n";
}
close F;
}
sub FetchURL # args = (URL)
# Returns document, or undef for failure
# Sends error details to STDERR
{
my $url = shift;
print STDERR "Fetching $url...\n"
if $verby > 1;
# my $buff = `lynx -mime_header \Q$url`;
# my ($err_num, $err_str) = ($?, $!);
# Create a user agent object
use LWP::UserAgent;
my $ua = new LWP::UserAgent;
$ua->agent("ComicFetch/0.1 " . $ua->agent);
$ua->env_proxy();
my $req = new HTTP::Request GET => $url; # Create a request
my $res = $ua->request($req); # Pass request to the user agent and get a response back
# Check the outcome of the response
unless ($res->is_success) {
print STDERR "Failed fetching $url: ".($res->status_line())."\n";
return undef;
}
my $buff = $res->content();
my $headers = $res->headers_as_string();
# check we didn't provoke the wwwoffled
if ($headers =~ /wwwoffle/i) {
print STDERR "comicfetch: wwwoffle intercepted: $url\n";
return undef;
}
# # check we got an HTTP header
# if (not $headers =~ /^HTTP (?:\/[\d.]+)? \s+ (\d+) (\s+.+)? \s*\n/x ){
# $headers =~ s#[\n\r]#/#; # kill the linefeeds
# print STDERR "No HTTP fetch code for $url, headers='$headers'";
# return undef;
# }
# if ($1 != 200) { # the HTTP answer code
# print STDERR "Received 'HTTP $1 $2$3' for '$url' : headers = >>$headers<<\n";
# return undef;
# }
# extract the last-modified: date or whatever here...
if ($headers =~ /^CONTENT-LENGTH:\s+(\d+)/im) {
my $exp_len = $1;
if ($exp_len != length $buff) {
print STDERR "Content of $url is ", (length $buff)," bytes but should be $exp_len\n";
# my $buff = `lynx -source \Q$url`;
# my ($err_num, $err_str) = ($?, $!);
# if ($? == 0 and length($buff) == $exp_len) {
# $headers = "[no headers on second fetch]\n";
# print STDERR "[but it looks OK on the second pass. Lynx is weird\n";
# } else {
return undef;
# }
}
}
if ($headers =~ m@^CONTENT-TYPE:\s+TEXT/HTML@im) {
# fix the \r\n's
$buff =~ s/\r\n/\n/g; # turn \r\n into \n (DOS html files, ick)
$buff =~ s/\r/\n/g; # CRs are nasty anyway. Let's kill them!
}
print STDERR "Fetched $url\n"
if $verby > 1;
return $buff;
}
sub FetchGifs # args as in first line 8-)
# Fetch approximately one .gif, complain if more or less than one
{
my ($fn, $title, $urlprefix, @gifs) = @_;
unless (@gifs == 1) {
print STDERR "Fetch warning for $title: ", scalar @gifs,
" 'img src' items found\n";
$exitcode |= 1 if @gifs == 0;
}
my $gif;
for ($gif=0; $gif < @gifs; $gif++) {
my $html = FetchURL("$urlprefix$gifs[$gif]");
unless (defined $html) {
print STDERR "Fetch failed for $title: gif ",($gif+1),"\n";
$exitcode |= 1;
next;
}
# dump it out to disk
DumpFile($fn .($gif!=0?"-$gif":''). '.gif', $html); # reports its own errors
}
}
|
Repository owner Powered by ViewCVS 1.0-dev |
ViewCVS and CVS Help |