############################################################################
#
# Wordpress.com 7 Day Referrer Log Parser by http://engtech.wordpress.com
#
# Thanks for your interest in this script, but I have to warn you that it
# isn't intended for general usage or that well supported. I'm offering it
# as a "Hey, this might be useful to you if you already know Perl/unix".
#
# How to run
#
# 1. Login to your wordpress.com blog and download the 7 Day Referrer page
#    to a file. This script might only work if the page is downloaded with
#    Firefox. I've also only tried having the downloaded page in the same
#    directory as the script.
#
# 2. Run the script giving the name of the downloaded page as the first
#    argument:
#       perl parse.pl week1.html
#
# 3. The first time the script is run with a new web page it will create
#    a CSV file. This file can be edited with a text editor or with
#    Microsoft Excel. When you first edit the file both columns will have
#    the same information. What you want to do is edit the second column
#    and create "groups":
#       board.progaming.it,other articles
#       chris.pirillo.com,chris.pirillo.com
#       coolthingoftheday.blogspot.com,long tail
#
# 4. Re-run the script and it will create an HTML file (e.g. week1_out.html)
#    with all of the URLs from the 7 Day Referrers page grouped as you
#    defined in the CSV file.
#    It will also create a second CSV file called "week1_graph.csv" that is
#    a list of the groups that are used for the first generated graph.
#    Any group that is not listed in this CSV file will be put on the second
#    generated graph.
#
# 5. The process I use is:
#    - Run the script to generate the CSV file
#    - Look at the output and create groups where it makes sense
#    - Keep re-running the script and editing the CSV until I like the way
#      the HTML output is organized.
#    - Split the graphs the way I want to.
#
# 6. I'm sorry the graphs are so friggin' ugly. I need to read a tutorial
#    on "How to not make GD::Graph output shit."
# ############################################################################ use strict; use warnings; use Data::Dumper; use GD::Graph::lines; use FileHandle; my $has_csv = 0; my $has_csv_graph = 0; my @data = (); my $data_idx = -1; my %sites = (); my %urls = (); my %translate = (); my %totalsFromSite = (); my $file = $ARGV[0]; if (! -f $file) { die "could not find file '$file': $!"; } my $prefix = $file; $prefix =~ s/\..*$//; my $csv_file = $prefix . ".csv"; my $csv_graph_file = $prefix . "_graph.csv"; my @sites_high = (); my $graph_high = $prefix."_high.png"; my $graph_low = $prefix."_low.png"; readCSV(); input(); if (not $has_csv) { writeCSV(); } output(); if ($has_csv && not $has_csv_graph) { writeCSVGraph(); } if ($has_csv && $has_csv_graph) { generateGraph(); } exit(0); sub readCSV { if (-r $csv_file) { $has_csv = 1; my $ifh = new FileHandle($csv_file, "r") || die "could not read file '$csv_file': $!"; while(<$ifh>) { chomp; my @data = split(/,/, $_); $translate{$data[0]} = $data[1]; } close($ifh); # print Dumper(%translate); } if (-r $csv_graph_file) { $has_csv_graph = 1; my $ifh = new FileHandle($csv_graph_file, "r") || die "could not read file '$csv_graph_file': $!"; while(<$ifh>) { chomp; @sites_high = split(/,/, $_); } close($ifh); # print Dumper(@sites_high); } } sub writeCSV { my $ofh = new FileHandle($csv_file, "w") || die "could not write file '$csv_file': $!"; foreach my $s (sort keys %sites) { print $ofh "$s,$s\n"; } close ($ofh); print "Please edit $csv_file to create groups and rerun.\n"; } sub writeCSVGraph { my $text = ""; foreach my $s (sort keys %sites) { chomp($s); $text .= ",$s"; } $text =~ s/^,//; my $ofh = new FileHandle($csv_graph_file, "w") || die "could not write file '$csv_graph_file': $!"; print $ofh $text . "\n"; close ($ofh); print "Please edit $csv_graph_file to break groups into high and low for the graphs.\n"; } sub input { my $ifh = new FileHandle($file, "r") || die "could not read file '$file': $!"; while (<$ifh>) { # if (m/
/) { my %hash = (); push(@data, \%hash); $data_idx++; #print "New day\n"; } # if (m/ elsif (m/
digg.com/view/all/popular/today/page37
(.*?)(\/.*|)<\/a><\/td>(\d+)<\/td><\/tr>/) { my ($url, $site, $hits) = ($1, $2, $4); process($url, $site, $hits); } #
engtech.wordpress.com/tag/nokia-66824
(.*?)(\/.*|)<\/td>(\d+)<\/td><\/tr>/) { my ($url, $site, $hits) = ("$1$2", $1, $3); process($url, $site, $hits); } } close($ifh); } sub process { my ($url, $site, $hits) = @_; #print "$hits, $url\n"; my $ref = $data[$data_idx]; # Normalize site urls if (defined $translate{$site}) { $site = $translate{$site}; } $sites{$site} = $data_idx; # newest to oldest # Keep track of URLs per site if (not defined $urls{$site}) { my %hash = (); $urls{$site} = \%hash; } $urls{$site}{$url} = 1; # Keep count if (not defined $ref->{$site}) { $ref->{$site} = 0; } $ref->{$site} += $hits; } sub output { delete($sites{'REMOVE'}); delete($urls{'REMOVE'}); my $ofile = $prefix."_out.html"; my $ofh = new FileHandle($ofile, "w") || die "could not write '$ofile': $!"; print $ofh "\n"; print $ofh ""; for(my $i=$#data; $i>=0; $i--) { my $day = $#data - $i + 1; print $ofh ""; } print $ofh "\n"; my @text = (); my @totals = (); foreach my $site (sort {sortSites($a, $b)} keys %sites) { push(@text, ""); push(@totals, 0); } for(my $i=$#data; $i>=0; $i--) { my $index = 0; foreach my $site (sort {sortSites($a, $b)} keys %sites) { # Normalize if (not defined $data[$i]->{$site}) { $data[$i]->{$site} = 0; } my $value = $data[$i]->{$site}; $totals[$index] += $value; $text[$index] .= ""; $totalsFromSite{$site} = $totals[$index]; $index++; } } for(my $i=0; $i<=$#text; $i++) { print $ofh $text[$i] . "\n"; } print $ofh "
SiteDay $dayTotals
$site".$value."" . $totals[$i] . "
\n"; print $ofh "\n"; foreach my $site (sort {sortSites($a, $b)} keys %sites) { my $total = $totalsFromSite{$site}; my $url_text = "
    "; foreach my $url (sort keys %{$urls{$site}}) { $url_text .= "
  • $url
  • "; } $url_text .= "
"; print $ofh ""; } print $ofh "
$site$total$url_text
\n"; close($ofh); } sub sortSites { my ($a, $b) = @_; my $vala = $sites{$a}; my $valb = $sites{$b}; if ($vala == $valb) { return($a cmp $b); } else { return($valb <=> $vala); } } sub generateGraph { my @graph_high = (); my @graph_low = (); my $max_high = 0; my $max_low = 0; my $day = 0; foreach my $ref (reverse @data) { my @row_high = (); push(@row_high, $day); foreach my $site (@sites_high) { my $value = $ref->{$site}; if ($value > $max_high) { $max_high = $value; } push(@row_high, $value); delete($sites{$site}); } push(@graph_high, \@row_high); $day++; } image($graph_high, $max_high, \@graph_high, \@sites_high); $day = 0; foreach my $ref (reverse @data) { my @row_low = (); push(@row_low, $day); foreach my $site (sort {sortSites($a, $b)} keys %sites) { my $value = $ref->{$site}; if ($value > $max_low) { $max_low = $value; } push(@row_low, $value); } push(@graph_low, \@row_low); $day++; } my @legend = (); foreach my $site (sort {sortSites($a, $b)} keys %sites) { push(@legend, $site); } image($graph_low, $max_low, \@graph_low, \@legend); } sub image { my ($file, $max, $gref, $lref) = @_; my @graph = @{$gref}; my @legend = @{$lref}; my $gdata = GD::Graph::Data->new(); foreach my $row (@graph) { $gdata->add_point(@{$row}); } my @colours = ("black", "blue", "purple", "green", "red", "gray", "dgray"); my $chart = GD::Graph::lines->new(600,375); $chart->set_legend(@legend); $chart -> set_x_axis_font("/usr/X11R6/lib/X11/fonts/TTF/lusimbi.ttf", 10); $chart -> set_y_axis_font("/usr/X11R6/lib/X11/fonts/TTF/luximbi.ttf", 10); $chart -> set_x_label_font("/usr/X11R6/lib/X11/fonts/TTF/luximb.ttf", 12); $chart -> set_y_label_font("/usr/X11R6/lib/X11/fonts/TTF/luximb.ttf", 12); $chart -> set_legend_font("/usr/X11R6/lib/X11/fonts/TTF/luximbi.ttf", 10); $chart->set ( y_label => "Traffic", x_label => "Days", y_max_value => $max, line_width => 3, y_long_ticks => 1, dclrs => [@colours] ); open(IMAGE, ">$file") or die "Cannot open $file output png file for writing: $!"; print IMAGE 
$chart->plot($gdata)->png; close IMAGE; }