[mca1001] / cronutils / comicfetch.pl  

mca1001: cronutils/comicfetch.pl

File: [mca1001] / cronutils / comicfetch.pl (download) (as text)
Revision: 1.2, Wed Feb 14 22:55:07 2001 UTC (9 years, 6 months ago) by mca1001
Branch: MAIN
CVS Tags: REL_2002_06_20, REL_2002_07_20, REL_2001_05_12, REL_2002_06_19, REL_2002_07_10, REL_2001_04_02, REL_2003_02_03, PUBLIC, HEAD
Changes since 1.1: +10 -11 lines
updates: track new United Media change (two months late )-8 ),
 use current user's HOME dir

#! /usr/bin/perl

# multi-source comic fetcher
# assumes a sane (eg. non-DOS) environment
# outputs filenames to STDOUT

# possible Y2K problems are marked with the string Y2K - it's
# difficult to know what the dilbert folks are going to do with their
# date format when we tick over.

use Time::Local;
use strict;
no strict 'refs';  # we need symbolic refs for sub-names
use vars qw(@toons $basedir $exitcode $verby @months);

# verbosity should be set from the comand line!
$verby = 0; # 0=quiet(ish), 2=noisy

# basedir contains a directory for each cartoon fetched
# the HOME env var will be exported by cron automatically...
die "HOME is where the blackhole is?" unless $ENV{'HOME'};
$basedir = "$ENV{'HOME'}/public_html/cartoons";

# a list of "source:name". Each source must be defined as a sub FetchSource
@toons = ('UM:dilbert', 'UF:userfriendly');

# bits set: 1=fetch errors, 2=store errors, 4=config error
$exitcode = 0;

$| = 1; $_ = select STDERR; $| = 1; select $_;  # autoflush

@months = ('jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec');

# main loop calls FetchBLAH for each strip
# if @ARGV contains anything, then only named cartoons are fetched
unless (-d $basedir or (mkdir $basedir, 0755)) {
    print STDERR "mkdir: $basedir: $!\n";
    exit 2;
}
foreach (@ARGV) {
    print STDERR "Comic '$_' is not defined\n"
	unless (join '::', @toons, '') =~ m@\:$_\:\:@i  or lc($_) eq 'index';
}
foreach (@toons) {
    my ($source, $name) = /^([^:]+):([^:]+)$/;
    if (@ARGV and not grep /^$name$/i, @ARGV) {
	print STDERR "Skipping $name\n";
	next;
    }
    if (not defined $source or not defined $name) {
	print STDERR "Cartoon '$_' doesn't fit 'source:name'\n";
	$exitcode |= 4;
	next;
    }
    my $dir = "$basedir/$name";
    unless (-d $dir or (mkdir $dir, 0755)) {
	print STDERR "mkdir: $dir: $!\n";
	$exitcode |= 2;
	next;
    }
    &{"Fetch$source"} ($name, $dir)
	unless grep /^index$/i, @ARGV;

    # now update the index files
    BuildIndex($name, $dir, $source);
}


exit $exitcode;


################  ################  ################
#                fetchers for comics
#

sub FetchUM # args = (comic-name, target-directory)
# Fetch named comic from www.unitedmedia.com
# Dies sensibly on fetch failures
# Doesn't fetch things it already has
# Warns if the monthly archive has < 25 entries
{
# index page = http://www.unitedmedia.org/comics/COMIC/archive/
# individual pages = <option value="/comics/COMIC/archive/cal-\d{2}.html">Jan. 01         (was archive/COMICyymmdd.html)
# pictures = /comics/COMIC/archive/images/COMIC\d+.gif
#  (pictures have \d+ usually contains yymmdd but sometimes doesn't)

    my ($comic, $dir) = (shift, shift);
    my $html = FetchURL("http://www.unitedmedia.com/comics/$comic/archive/");
    unless (defined $html) {
	print STDERR "Fetch failed for United Media: $comic archive\n";
	$exitcode |= 1;
	return;
    }

    my %days;
    while ($html =~ m@=\s*"(/comics/$comic/archive/$comic-(\d+)\.html)"\s*>@gi) {
	$days{$2} = $1;
    }
    if (scalar keys %days < 25) {
	print STDERR "Fetch warning for United Media: $comic archive has only ",
	scalar keys %days, " entries\n";
#print STDERR "dump of days >>%days<<\n\n";
	$exitcode |= 1;
    }

    my $daypage;
    my $daycode;
    foreach $daycode (sort keys %days) {
	$daypage = $days{$daycode};
	if ($daycode !~ /^(\d{4})(\d{2})(\d{2})$/) {
	    print STDERR "Fetch warning United Media: can't parse date code '$daycode'\n";
	    $exitcode |= 1;
	    next;
	}
#TODO .. check it's a valid daycode!
	my $day = $daycode;

	# do we have it already?
	next if -f "$dir/$day.gif";

	# fetch the page for the day
	$html = FetchURL("http://www.unitedmedia.com$daypage");
	unless (defined $html) {
	    print STDERR "Fetch failed for United Media: $comic: daypage $day\n";
	    $exitcode |= 1;
	    next;
	}
	# extract a (list of) gifs for the day - should be just one
	my @gifs = $html =~ m@<img\s+src\s*=\s*"(/comics/$comic/archive/images/$comic\d+\.gif)"@gi;
	FetchGifs("$dir/$day", "United Media: $comic: $day", "http://www.unitedmedia.com", @gifs);
    }
}


sub FetchUF # args = (comic-name, target-directory)
# Fetch named comic from www.userfriendly.com
# Dies sensibly on fetch failures
# Doesn't fetch things it already has
{
# index page = http://www.userfriendly.org/cartoons/archives/
# individual pages = /cartoons/archive/YYmmm/YYYYMMDD.html
# pictures = /cartoons/YYmmm/\S+.gif
#  (later pictures have \S+ == YYYYMMDD but earlier ones look random, newer ones like uf000919.gif)

# Starts with today and goes backwards making URLs for the last $days days

# May or may not be Y2K compliant depending on what userfriendly.org do!

# Likely to find that "today's" cartoon is unavailable or out of date
# if we're in a different timezone

    my $days = 333;   # checks the last $days days files exist
    my ($comic, $dir) = (shift, shift);
    my ($month, $day);
    die "Can only fetch Userfriendly" unless lc($comic) eq "userfriendly";

    # timezone hours offset isn't really needed unless you want to
    # fetch as soon as the new one is available
    my $now = time() + 3600 * 0;

    for ( ; $days >= 0; $days--) {
	@_ = localtime($now - $days * 3600 * 24);
	$_[5] += 1900;
	$month = substr($_[5],2,2) . $months [$_[4]];
	$day = $_[5] . two($_[4]+1) . two($_[3]);

	if (not -f "$dir/$day.gif") {
	    print STDERR "Userfriendly '$day' ...\n" if $verby > 0;
	    my $html = FetchURL("http://www.userfriendly.org/cartoons/archives/$month/$day.html");
	    unless (defined $html) {
		print STDERR "Fetch failed for Userfriendly: $day's page\n";
		$exitcode |= 1;
		return;
	    }
	    print STDERR "... index ...\n" if $verby > 0;

	    my @gifs = $html =~ m@<img\s+[^>]*src\s*=\s*"(?:http://www.userfriendly.org)?(/cartoons/archives/\d{2}[abcdefgjlmnoprstuvy]{3}/\w+.gif)"@gi;
	    FetchGifs("$dir/$day", "Userfriendly: $day", "http://www.userfriendly.org", @gifs);
	    print STDERR "... and picture\n\n" if $verby > 0;
	} else { print STDERR "Userfriendly '$day' already exists\n" if $verby > 0 }
    }
}

sub two # arg = 0 <= number < 100
{
    return $_[0] if $_[0] > 9;
    return "0$_[0]";
}

################  ################  ################
#                 index generator
#


sub BuildIndex # args = (name, directory, cartoon-source)
# update html files for the directory
{
    my ($name, $dir, $src) = (shift, shift, shift);
    my $chunksize = 14; # days' cartoons per index-chunk

# rather inefficient, but we bin all previous index files at the moment
    foreach (glob "$dir/*.html") { unlink }

# for glob of *.gif ... guess dates and if any gifs -m after the -m of
# that (week, month)'s index then remake

    # find the oldest cartoon
    my $oldest = time();
    my @gifs = glob "$dir/*.gif";  # used again later
    print STDERR "Indexing $dir: ",(scalar @gifs)," items\n" if $verby;
    foreach (@gifs) {
	s@^$dir/@@i;
	die "Don't understand date format of $dir/$_"
	    unless /^(\d{2}|\d{4})(\d{2})(\d{2})(-\d+)?.gif$/i;
	my ($yr, $mn, $dy) = ($1, ($2)-1, $3); # $4 is just for multiple gifs on one day
	$yr += 1900 if $yr < 100;  # Y2K problem!
	my $then = timelocal(0, 0, 12, $dy, $mn, $yr);
	die "Time overflow!" if $then == -1;
	$oldest = $then unless $oldest < $then;
    }
    # work back from the oldest until we hit a Sunday (just for convenience)
    while ((localtime($oldest))[6] != 0) { $oldest -= 24 * 3600 }
    print STDERR "  Starting date: ",scalar localtime($oldest),"\n" if $verby;

    # Now work forwards making a indices for each $chunksize days and noting holes
    # Also keep an index of chunks in index.html
    die "Can't make index $dir/index.html"
	unless open IDX, ">$dir/index.html";
    print IDX "<html><head>
<meta http-equiv=\"pragma\" content=\"no-cache\">
<title>Index for \u$name</title></head>
<body><a href=\"#bot\">Jump to the bottom</a>
<table border=3 cellpadding=5 cellspacing=0>\n";


    my $then;
    for ($then=$oldest; $then <= time()+24*3600; $then += $chunksize * 24*3600) {
	my ($i, $remake, @toons) = ();
	my $present = 0;
	# 0 = none found,         1 = early ones missing but otherwise OK,
	# 2 = all present so far, 3 = holes exist
	for ($i = 0; $i < $chunksize; $i++) {
	    my $now = $then + $i * 3600 * 24;
	    @_ = localtime($now);
	    $_[5] += 1900;
	    my $day = $_[5] . two($_[4]+1) . two($_[3]);
	    my $noCEday = substr($day, 2);
	    my @daystoons = grep /^($day|$noCEday)(-\d+)?.gif$/i, @gifs;
	    if (@daystoons) {
		push @toons, [$now, @daystoons];
		if ($i == 0)          { $present = 2 }
		elsif ($present == 0) { $present = 1 }
	    } else {
		# missing one day
		push @toons, [$now];
		$present = 3 unless $present == 0;  # 
	    }
	}
	# write the index file for this chunk
	my $start;
	($_, $start) = BuildIndex_date($then);
	my $min_fetcheddays = 365000;  # Y2K (um, Y1M) 8-)
	if ($present > 0) {
	    die "Can't make index $dir/index$start.html"
		unless open CHKIDX, ">$dir/index$start.html";
	    my $nextfile = (BuildIndex_date($then + $chunksize * 3600 * 24))[1];
	    print CHKIDX "<html><head>
<meta http-equiv=\"pragma\" content=\"no-cache\">
<title>Index for \u$name : $_</title></head>
<body bgcolor=#ffffff><a name=\"top\"></a>";
	    print CHKIDX "<a href=\"index$nextfile.html#bot\">Next</a>\n"
		unless $then + $chunksize * 3600*24 > time();
	    print CHKIDX "<a href=\"index.html\">Return to index</a>
<table border=3 cellpadding=50 cellspacing=0>\n";
	    foreach (reverse @toons) {
		my $when = shift @$_;
		if ($when < time() or @$_ != 0) {
		    $when = scalar localtime($when);
		    $when =~ s/\s*(\d{2}:){2}\d{2}//;
		    print CHKIDX "<tr><td>$when<br>";
		    my $pic;
		    if (@$_) {
			foreach $pic (@$_) {
			    my $age = -M "$dir/$pic";
			    $min_fetcheddays = $age unless $min_fetcheddays < $age;
			    print CHKIDX "<img src=\"$pic\"> ";
			}
		    } else {
			print CHKIDX "<em>Missing</em>";
		    }
		    print CHKIDX "\n<a name=\"bot\"></a>" if $_ eq $toons[0];
		    print CHKIDX "</td></tr>\n";
		}
	    }
	    print CHKIDX "</table><a href=\"#top\">Up to top</a>\n</body></html>\n";
	    close CHKIDX;
	}
	print STDERR "  Chunk ",(scalar localtime($then))," : present = $present\n" if $verby;
	if ($present == 0) { print IDX "<tr><td>$_</td><td>"                                  }
	else               { print IDX "<tr><td><a href=\"index$start.html#bot\">$_</a></td><td>" }
	print IDX "State: ", ('<b>none</b>', 'first ones missing', 'OK', '<b>holes!</b>')[$present];
	$min_fetcheddays = int ($min_fetcheddays*10) / 10;
	if ($present == 0) {
	    print IDX "</td></tr>\n";
	} else {
	    print IDX "</td><td>Newest fetched $min_fetcheddays day".($min_fetcheddays==1?'':'s')." ago";
	    print IDX "<img src=\"../new.gif\" alt=\"[new!]\">"  if $min_fetcheddays < 3;
	    print IDX "</td></tr>\n";
	}
    }
    print IDX "</table><a href=\"..\">Up to index</a><a name=\"bot\"</a></body></html>\n";
    close IDX;

}


sub BuildIndex_date # arg = (time-since-epoch)
# returns human readable and compacted versions
{
    my $then = shift;
    @_ = localtime($then);
    $_[5] += 1900;
    my $start = "$_[3]$months[$_[4]]$_[5]";
    my $spaced = "$start";
    $spaced =~ s/^(\d+)([a-z]+)(\d+)$/$1 \u$2 $3/i;
    return ($spaced, $start);
}


################  ################  ################
#                implementation subs
#

sub DumpFile # args = (filename, binary-data)
# prints filename on success, warns and set exitcode on failure
{
    my ($fn, $data) = (shift, shift);

    if (not open F, ">$fn") {
	print STDERR "open: $fn: $!\n";
	$exitcode |= 2;
	return;
    }
    if (not print F $data) {
	print STDERR "print: $fn: unspecified failure (disk full?)\n";
	$exitcode |= 2;
    } else {
	print "$fn\n";
    }
    close F;
}


sub FetchURL # args = (URL)
# Returns document, or undef for failure
# Sends error details to STDERR
{
    my $url = shift;

    print STDERR "Fetching $url...\n"
	if $verby > 1;

#    my $buff = `lynx -mime_header \Q$url`;
#    my ($err_num, $err_str) = ($?, $!);


    # Create a user agent object
    use LWP::UserAgent;
    my $ua = new LWP::UserAgent;
    $ua->agent("ComicFetch/0.1 " . $ua->agent);
    $ua->env_proxy();

    my $req = new HTTP::Request GET => $url;	# Create a request
    my $res = $ua->request($req);		# Pass request to the user agent and get a response back

    # Check the outcome of the response
    unless ($res->is_success) {
	print STDERR "Failed fetching $url: ".($res->status_line())."\n";
	return undef;
    }
    my $buff = $res->content();
    my $headers = $res->headers_as_string();

    # check we didn't provoke the wwwoffled
    if ($headers =~ /wwwoffle/i) {
	print STDERR "comicfetch: wwwoffle intercepted: $url\n";
	return undef;
    }

#    # check we got an HTTP header
#    if (not $headers =~ /^HTTP  (?:\/[\d.]+)?  \s+  (\d+) (\s+.+)? \s*\n/x ){
#	$headers =~ s#[\n\r]#/#;  # kill the linefeeds
#	print STDERR "No HTTP fetch code for $url, headers='$headers'";
#	return undef;
#    }
#    if ($1 != 200) { # the HTTP answer code
#	print STDERR "Received 'HTTP $1 $2$3' for '$url' : headers = >>$headers<<\n";
#	return undef;
#    }

# extract the last-modified: date or whatever here...

    if ($headers =~ /^CONTENT-LENGTH:\s+(\d+)/im) {
	my $exp_len = $1;
	if ($exp_len != length $buff) {
	    print STDERR "Content of $url is ", (length $buff)," bytes but should be $exp_len\n";
#	    my $buff = `lynx -source \Q$url`;
#	    my ($err_num, $err_str) = ($?, $!);
#	    if ($? == 0 and length($buff) == $exp_len) {
#		$headers = "[no headers on second fetch]\n";
#		print STDERR "[but it looks OK on the second pass. Lynx is weird\n";
#	    } else {
		return undef;
#	    }
	}
    }

    if ($headers =~ m@^CONTENT-TYPE:\s+TEXT/HTML@im) {
	# fix the \r\n's
	$buff =~ s/\r\n/\n/g; # turn \r\n into \n  (DOS html files, ick)
	$buff =~ s/\r/\n/g;   # CRs are nasty anyway. Let's kill them!
    }

    print STDERR "Fetched $url\n"
	if $verby > 1;

    return $buff;
}


sub FetchGifs # args as in first line 8-)
# Fetch approximately one .gif, complain if more or less than one
{
    my ($fn, $title, $urlprefix, @gifs) = @_;

    unless (@gifs == 1) {
	print STDERR "Fetch warning for $title: ", scalar @gifs,
	" 'img src' items found\n";
	$exitcode |= 1 if @gifs == 0;
    }

    my $gif;
    for ($gif=0; $gif < @gifs; $gif++) {
	my $html = FetchURL("$urlprefix$gifs[$gif]");
	unless (defined $html) {
	    print STDERR "Fetch failed for $title: gif ",($gif+1),"\n";
	    $exitcode |= 1;
	    next;
	}
	# dump it out to disk
	DumpFile($fn .($gif!=0?"-$gif":''). '.gif', $html); # reports its own errors
    }
}

Repository owner

Powered by ViewCVS 1.0-dev
(Powered by ViewCVS)

ViewCVS and CVS Help