#!/usr/bin/env perl # CVS: $Id: ljsm.pl,v 2.14 2017/04/18 09:22:00 sasha Exp $ # Author: Alexander Nikolaev #perl2exe_info FileDescription=Utility for Livejournal.com backup #perl2exe_info ProductName=LJSM #perl2exe_info ProductVersion=2.14.0.0 #perl2exe_info FileVersion=2.14.0.0 #perl2exe_info LegalCopyright=GPL #perl2exe_info CompanyName=Alexander Nikolaev =head1 SYNOPSYS see HELP_MESSAGE() subroutine for usage summary =head1 SETUP I've tested this script with Windows Me/XP, ActiveState perl v. 5.6.0-5.8.0 with the following ppm modules installed: libwww-perl 5.48 =head1 TODO - [?] fix -d and -O interaction - [x] add charset pragma to index.html - [x] add command-line switch for image download - [x] add &usescheme=lynx to lj queries. - [x] proxy support - [-] usable pager - [x] better date range handling (command-line switch?), - [x] explicitly show "You must be logged in to view this protected entry" and "This journal is deleted." cases in the index file - [x] no comments download - [ ] remove javascript from downloaded files - [ ] generate windows help project (.hhp) file for downloaded journals =head1 LINKS http://www.offtopia.net/ljsm/ http://www.livejournal.com/talkread.bml?journal=ru_hitech&itemid=118529 http://www.livejournal.com/talkread.bml?itemid=122758&journal=ru_hitech http://www.livejournal.com/talkread.bml?journal=ru_hitech&itemid=158872 http://www.livejournal.com/talkread.bml?itemid=394253&journal=rulj http://www.livejournal.com/community/rulj/532637.html http://www.livejournal.com/community/rulj/595146.html http://www.livejournal.com/community/rulj/854727.html http://www.livejournal.com/community/lj_clients/133229.html http://www.livejournal.com/community/lj_clients/197260.html =head1 SUBROUTINES =cut use constant { LOGIN => '', # leave it empty if you don't want to login PASSWORD => '', START_YEAR => 2001, # fetch data back to this year UTF8_DECODE => 0, # convert text to local charset LOCAL_CHARSET => 'windows-1251', # windows cyrillic DEBUG_LEVEL => 3, # 0 - quiet, 1 - essential, 2 - verbose LOCAL_DIR => '', # local directory to put files into. Leave it empty to put in the current directory. Slash (/, if not empty) in the end required. HTTP_PROXY => '', # set proxy URL if you use http proxy CLIENT => 'Perl-ljsm/2.14; variomap@gmail.com', CVSVERSION => '$Revision: 2.14 $', # don't touch this SAVE_PICS => 1, # download standard icons (1) usepics (2) or all graphics referenced by post (3) BASE_DOMAIN => 'livejournal.com' #'lj.rossia.org' }; # =================================================================== # end of public constants definition. no user-editable parts below this line # =================================================================== use constant BASE_URL => 'https://www.' . BASE_DOMAIN . '/'; use constant { MAX_TRIES => 5, # max tries to get page in case of failure CATALOG_URL => BASE_URL . 'view/?type=month', LOGIN_SCRIPT => 'login.bml', POST_SCRIPT => 'talkread.bml', POST_SCRIPTNOC => 'talkpost.bml', MEMO_SCRIPT => 'tools/memories.bml', EXPORT_SCRIPT => 'export.bml', INTERFACE => BASE_URL . 'interface/flat', BROKEN_CLIENT_INTERFACE => 1 # sessiongenerate call does not return nessesary cookies }; #use Data::Dumper; #use Carp; use local::lib; use LWP::UserAgent; #perl2exe_include LWP::Protocol::https #perl2exe_include URI::https #perl2exe_include PerlIO #perl2exe_include PerlIO::scalar use HTTP::Cookies; use File::Path; use File::Basename; use File::Find; use Compress::Zlib; use Digest::MD5 qw(md5_hex); use Getopt::Std; $Getopt::Std::STANDARD_HELP_VERSION = 1; use strict; my ($ua, $req, $res, $login, @posts, %images, $user, %users, %stat, %memories, %posts, $umask); our ($opt_r, $opt_m, $opt_a, $opt_c, $opt_O, $opt_i, $opt_I, $opt_u, $opt_U, $opt_x, $opt_t, $opt_p, $opt_d, $opt_L); # open log file (delete it if there were no errors) $umask = umask 0077; open LF, ">>ljsm.log" or die "error opening ljsm.log for appending: $!\n"; umask $umask; print LF "\n============= " . join(' ', @ARGV) . "\n"; print LF scalar localtime() . "\n"; # steal options from @ARGV before we go for users getopts('rmacxtOIULu:p:d:i:'); $opt_i = SAVE_PICS if (!$opt_i); HELP_MESSAGE() && exit unless (@ARGV); # rebuild indexes and exit if -x option is set if ($opt_x) { foreach $user (@ARGV) { # for each user %users = (); logmsg("rebuilding index file for user $user...\n"); build_index($user); logmsg(" done.\n"); } exit 0; } # init global vars $stat{$_} = 0 foreach ('users','pages_ok','got_posts','images'); %images = (); $ua = new LWP::UserAgent; $ua->agent(CLIENT); $ua->cookie_jar(new HTTP::Cookies( # file => "ljcookies.txt", autosave => 0) ); push @{ $ua->requests_redirectable }, 'POST'; # LWP communication logging # previously done by LWP::Debug # $ua->add_handler("request_send", sub { shift->dump; return }); # $ua->add_handler("response_done", sub { shift->dump; return }); # set proxy URL for LWP requests $ua->proxy('http', HTTP_PROXY) if HTTP_PROXY; if ($opt_p) { $opt_p = "http://$opt_p" unless ($opt_p =~ m{http://}); $ua->proxy('http', $opt_p); } # get cookies exit 1 unless (!(LOGIN || $opt_u) || ($login = lj_login())); # get posts and memories foreach $user (@ARGV) { # for each user %memories = %posts = %users = (); @posts = (); $stat{'count_posts'} = $stat{'count_memos'} = 0; logmsg("\n\n=== processing user $user\n"); push @posts, get_memos($user) if ($opt_m || $opt_a); push @posts, get_posts($user) unless ($opt_m && !$opt_a); get_files($user); build_index($user) if @posts; undef @posts; # free memory $stat{'users'}++; } # get images if (($stat{'got_posts'} > 0) && (scalar keys %images) && $opt_i) { get_pics(); } # ============================================ # subroutines # ============================================ # http://www.livejournal.com/talkread.bml?journal=ru_hitech&itemid=118529 # http://users.livejournal.com/_a_/2001/05/ (calendar view) # http://ati.livejournal.com/2001/05/ (calendar view 2) # http://www.livejournal.com/tools/memories.bml?user=_a_ (memories) # http://ivan-da-marya.livejournal.com/40324.html (post, user with underscores) # http://users.livejournal.com/_a_/570.html (post, user with uderscores -2 ) # http://community.livejournal.com/lj_dev/1234.html (post in comunity) # http://community.livejournal.com/rulj/862105.html (post in community without underscores) # http://users.livejournal.com/4x-/189231.html # (same as http://rulj.livejournal.com/862105.html) # # link type may be # P - post # Q - post in community # M - memories # C - month view for calendar # sub parse_link { my ($link) = @_; my ($link_type, $user, $post_id); my %url_parts = ( '/tools/memories.bml\?user=([-\w]+)' => 'M', '/users/([-\w]+)/(\d+).html' => 'P', '/~?([-\w]+)/(\d+).html' => 'P', '/\d+.html' => ['P', "^https?://([-\\w]+)\\.@{[BASE_DOMAIN]}/(\\d+)"], '/community/([-\w]+)/(\d+).html' => 'Q', '/[-\w]+/\d+.html' => ['X', "^https?://(?:users|community)\\.@{[BASE_DOMAIN]}/([-\\w]+)/(\\d+)"], '/users/([-\w]+)/\d{4}/\d{2}/' => 'C', '/([-\w]+)/\d{4}/\d{2}/' => 'C', '/~([-\w]+)/\d{4}/\d{2}/' => 'C', '/\d{4}/\d{2}/' => ['C', "^https?://([-\\w]+)\\.@{[BASE_DOMAIN]}"], '/talkread.bml\?journal=([-\w]+)&itemid=(\d+)' => 'P' ); $link_type = ''; foreach my $part (keys %url_parts) { if ($link =~ m#@{[BASE_DOMAIN]}$part#) { if (ref($url_parts{$part})) { # match against second regexp $link_type = $url_parts{$part}->[0]; $link =~ m#$url_parts{$part}->[1]#; if ($link_type eq 'X') { # user or community post? $link_type = ($link =~ m#https?://community#)? 'Q' : 'P'; } $user = $1; $post_id = $2; } else { $link_type = $url_parts{$part}; $user = $1; $post_id = $2; } last; } } $user =~ s/-/_/g if (defined $user); return ($link_type, $user, $post_id); } sub make_link { my ($link_type, $user, $post_id) = @_; #warn "make_link: $link_type, $user, $post_id\n"; return undef unless defined $user; (print("INTERNAL ERROR: don't know how to make links of type $link_type\n") && return undef) unless $link_type =~ /[PQ]/; my $prefix = 'http://'; if ($user =~ /^_/) { $prefix .= ($link_type eq 'Q')? "community.@{[BASE_DOMAIN]}" : "users.@{[BASE_DOMAIN]}"; $prefix .= "/$user"; } else { $user =~ s/_/-/g; $prefix .= "$user.@{[BASE_DOMAIN]}"; } $prefix .= "/$post_id.html"; return $prefix; } =item get_date_range($user) get year and month of the last downloaded post =cut sub get_date_range { my ($user, $is_xml) = @_; my ($start_year, $start_month, $end_year, $end_month, @date, $t); @date = localtime(); # get end date if ($opt_d) { ($start_year, $start_month, $end_year, $end_month) = split(/\D/, $opt_d); $end_year = $date[5]+1900 unless $end_year; $end_month = $date[4]+1 unless $end_month; # swap dates if specified in reversed order if ($start_year > $end_year) { ($start_year, $end_year, $start_month, $end_month) = ($end_year, $start_year, $end_month, $start_month); } elsif (($start_year == $end_year) && ($start_month > $end_month)) { ($start_month, $end_month) = ($end_month, $start_month); } return ($start_year, $start_month, $end_year, $end_month); } else { $start_year = START_YEAR; $start_month = 1; $end_year = $date[5] + 1900; $end_month = $date[4] + 1; } # set start_year, start_month based on the downloaded posts if (!(-d LOCAL_DIR . $user) || $opt_O || $opt_r) { return ($start_year, $start_month, $end_year, $end_month); } if (!$is_xml) { # date range between last post and current month opendir(UD, LOCAL_DIR . $user) or die "error opening " . LOCAL_DIR . "$user directory for reading: $!\n"; my ($year) = sort {$b <=> $a } grep(/^\d+$/, readdir(UD)); closedir UD; return ($start_year, $start_month, $end_year, $end_month) unless $year; opendir(UD, LOCAL_DIR . "$user/$year") or die "error opening " . LOCAL_DIR . "$user/$year directory for reading: $!\n"; my ($month) = sort {$b <=> $a } grep(/^\d+$/, readdir(UD)); closedir UD; $month = 1 unless $month; return ($year, $month, $end_year, $end_month); } else { # date range for XML export opendir(UD, LOCAL_DIR . $user . '/export') or die "error opening " . LOCAL_DIR . "$user/export directory for reading: $!\n"; my ($lastfile) = reverse sort grep (/^\d+_\d+\.xml$/, readdir UD); closedir UD; if ($lastfile && ($lastfile =~ /^(\d+)_(\d+)/)) { $start_year = $1; $start_month = $2; } return ($start_year, $start_month, $end_year, $end_month); } } =item get_pics() download userpics, buttons etc =cut sub get_pics { my ($imgsrc, $img); logmsg("getting pictures...\n",2); foreach $imgsrc (keys %images) { # test if there is already image with the same name next if (-f $images{$imgsrc}); # get image if ($img = get_page($imgsrc, 1)) { mkpath(dirname($images{$imgsrc}), DEBUG_LEVEL, 0755) unless -d dirname($images{$imgsrc}); if (open (DF, ">$images{$imgsrc}")) { binmode DF; print DF $img; close DF; $stat{'images'}++; } else { logmsg("error opening $images{$imgsrc} for writing: $!\n",0); } } else { logmsg("error getting $imgsrc\n",0); } } } =item get_memos($user) get list of user's memories and store them is $posts{memos} =cut sub get_memos { my ($user) = @_; my($content, $amuser, $keyword); my (@memos, $link, $link_post); logmsg("getting list of memories...\n",2); # get list of keywords if ($content = get_page(BASE_URL . MEMO_SCRIPT . "?user=$user")) { foreach $link (&tiny_link_extor(\$content, 0)) { next unless $link =~ /@{[MEMO_SCRIPT]}\?user=[-\w]+\&keyword=(.*?)\&filter=all$/; $keyword = $1; $keyword = " " unless length $keyword; # unescape keywords $keyword =~ s/\+/ /g; $keyword =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; $keyword = from_utf8($keyword) if (UTF8_DECODE || $opt_U); # get list of posts for the given keyword $link = BASE_URL . $link unless ($link =~ /^@{[BASE_URL]}/); if ($content = get_page($link)) { my ($link_type, $post_id); foreach $link_post (&tiny_link_extor(\$content, 0)) { ($link_type, $amuser, $post_id) = &parse_link($link_post); next unless $link_type =~ /[PQ]/; next unless ($opt_O || ! -f LOCAL_DIR . "$user/memories/$amuser\_$post_id.html"); push @memos, { 'type' => 'memo', 'link_type' => $link_type, 'status' => 0, 'amuser' => $amuser, 'keyword' => $keyword, 'link' => $link_post, 'itemid' => $post_id }; } } else { # error fetching list of posts logmsg("error fetching list of posts for user $user, keyword $keyword",0); } } } else { # error fetching list of keywords logmsg("error fetching list of keywords for $user\n",0); } return @memos; } =item get_posts($user) get list of user's posts and store them in $posts{posts} =cut sub get_posts { my ($user) = @_; my ($content, $year, $month, @posts, $link, $emonth, $itemid, $link_type, $post_id, $amuser); @posts = (); my ($start_year, $start_month, $end_year, $end_month) = get_date_range($user); logmsg("getting posts links for $user " . sprintf("[ %4d/%02d - %4d/%02d ]", $start_year,$start_month,$end_year,$end_month) . "\n"); $year = $end_year; YEAR: while ($year >= $start_year) { $emonth = ($year == $start_year)? $start_month : 1; for ($month = 12; $month >= $emonth; $month--) { next if (($year == $end_year) && ($month > $end_month)); #fetch catalog data if ($content = get_page(CATALOG_URL . "&user=$user&y=$year&m=$month")) { # process links. foreach $link (reverse sort &tiny_link_extor(\$content, 0)) { ($link_type, $amuser, $post_id) = &parse_link($link); next unless $link_type =~ /[PQ]/; push @posts, { 'type' => 'post', 'link_type' => $link_type, 'status' => 0, 'year' => $year, 'month' => $month, 'link' => $link, 'itemid' => $post_id, 'amuser' => $user, 'keyword' => '', 'comments' => ($content =~ m#$post_id\.html\D+(\d+)\s+repl#)? $1 : 0 }; } # link loop on the catalog page } else { # error fetching catalog data logmsg("!! Error fetching catalog data. Going on with " . @posts . " posts\n"); return @posts unless $opt_I; } } # months loop $year--; } # years loop return @posts; } =item get_files($user) download and process posts and memories. =cut sub get_files { my ($user) = @_; my ($post, $dir, $fname, $result, $extor, $up, $navbar, $n, $myhref); my $charset = "" : "charset=utf-8\">"; #print "going to get " . (scalar @posts) . " posts.\n"; logmsg((@posts)? "getting " . @posts . " posts...\n" : "No new posts to download.\n"); # print Dumper(\@posts); foreach $post (@posts) { if ($post->{'type'} eq 'post') { $dir = LOCAL_DIR . "$user/$post->{year}/$post->{month}"; $fname = "$post->{itemid}.html"; $up = "../../.."; } else { # memo $dir = LOCAL_DIR . "$user/memories"; $fname = "$post->{amuser}_$post->{itemid}.html"; $up = "../.."; } if (-s "$dir/$fname") { if ($opt_O) { logmsg("!! overwriting $dir/$fname\n", 2); } elsif ($opt_r) { logmsg("-r: skipping $dir/$fname\n", 2); next; } else { last; } } # old scheme if ($post->{'link'} =~ m#(journal=\w+&itemid=\d+)#) { $myhref = POST_SCRIPT . "?$1"; $post->{'link'} =~ s/@{[POST_SCRIPT]}/@{[POST_SCRIPTNOC]}/ if ($opt_c); } # new scheme if ($post->{'link'} =~ m#/(\d+)\.html#) { $myhref = "$1.html"; $post->{'link'} .= '?mode=reply' if ($opt_c); } # User can get a link for the hidden post, for example trying to fetch some other's # memories. In this case ljsm should continue fetching other posts, not panic on error my ($should_continue_on_error, $content) = get_page($post->{'link'}, 0); if ($content) { $stat{'got_posts'}++; mkpath($dir, DEBUG_LEVEL, 0755); &cleanup_html(\$content, $myhref, $user); &rewrite_imgsrc(\$content, $up, $user) if ($opt_i); $content = from_utf8($content) if (UTF8_DECODE || $opt_U); $content =~ s//\n\n$charset/; logmsg(">> $dir/$fname\n",2); open DF,">$dir/$fname" or die "error opening $dir/$fname for writing: $!\n"; print DF $content; close DF; $post->{'status'} = 1; } else { # error fetching page print "error fetching " . $post->{'link'} . "\n"; last unless $opt_I || $should_continue_on_error; } } } =item rewrite_imgsrc(\$page, $up) rewrite img's src attribute depending on $opt_i =cut sub rewrite_imgsrc { my ($page, $up, $user) = @_; my ($src, $d1, $d2); logmsg("extracting img src's...\n", 4); # list of unique image src's %images = (%images, map {$_ => 1} &tiny_link_extor($page, 1)); ($d1, $d2) = split('\.', BASE_URL, 2); foreach $src (keys %images) { if ($src =~ m#stat\.${d2}img/(.*)$#) { if ($opt_i > 0) { $d1 = $1; $$page =~ s#src=["']\Q$src\E['"]#src='$up/img/$d1'#sg; $images{$src} = "img/$d1"; } else { delete $images{$src}; } } elsif ($src =~ m#userpic\.$d2(\d+/\d+)$#) { if ($opt_i > 1) { $d1 = $1; $$page =~ s#src=['"]\Q$src\E['"]#src='$up/userpic/$d1'#sg; $images{$src} = "userpic/$d1"; } else { delete $images{$src}; } } elsif ($opt_i > 2) { $d1 = $src; $d1 =~ s#^https?://##; $d1 =~ s#[^(\w|\/|\.)]#_#g; $$page =~ s#src\s*=\s*['"]\Q$src\E['"]#src='$up/$user/img/$d1'#sg; $images{$src} = "$user/img/$d1"; } else { delete $images{$src}; } } } sub cleanup_html { my ($page, $myhref, $user) = @_; my ($result, $in_navbar, $in_reply, %links, $rlink); $in_navbar = 1; $in_reply = 0; $result = ''; foreach (split(/\n/, $$page)) { $in_reply = 0 if (m##i); next if $in_reply; $in_reply = 1 if ($opt_c && m#Read comments#); s#

.*##i if $in_reply; next if /^

\[ \]<\/p>$/; # add css link and keywords hook s##\n#i; # remove action buttons and forms # (delete, screen, mass action etc) s###g; s###g; s###g; s###g; s###g; s#\s*##g; s#

Mass action.*$##g; s###g; $result .= "$_\n"; } # safety net for runaway regexps if (length($result) == 0) { logmsg("** Failed to parse '" . $myhref . "', saving original HTML page\n"); $result = $$page; } # replace relative hrefs with absolute %links = map {$_ => 1} &tiny_link_extor(\$result, 0); my $prefix = 'http://'; if ($user =~ /^_/) { $prefix .= "users.@{[BASE_DOMAIN]}/$user"; } else { $user =~ s/_/-/g; $prefix .= "$user.@{[BASE_DOMAIN]}"; } foreach $rlink (keys %links) { $rlink =~ s#^/[^/]##; $result =~ s#href=(['"])\Q$rlink\E\1#href="$prefix/$rlink"#sg } $$page = $result; 1; } =item lj_login() login to server, get cookies =cut sub lj_login { logmsg("logging in to " . BASE_URL . "... \n", 1); my ($user, $password) = ((defined $opt_u) && (length $opt_u > 0))? split(":", $opt_u, 2) : (LOGIN, PASSWORD); my ($status1, $lj1) = &lj_interface_query( { 'mode' => 'getchallenge' } ); if (!$status1) { my ($status2, $lj2) = &lj_interface_query( { 'mode' => 'sessiongenerate', 'user' => $user, 'auth_method' => 'challenge', 'auth_challenge' => $lj1->{'challenge'}, 'auth_response' => md5_hex($lj1->{'challenge'} . md5_hex($password)), 'expiration' => 'long', 'ipfixed' => 1 } ); if (!$status2) { my $isok = 0; if (BROKEN_CLIENT_INTERFACE) { # go get cookies to /login.bml my ($status1, $lj1) = &lj_interface_query( { 'mode' => 'getchallenge' } ); if (!$status1) { my $req = new HTTP::Request(POST => BASE_URL . LOGIN_SCRIPT); $req->content_type('application/x-www-form-urlencoded'); my $content = "chal=".$lj1->{'challenge'} . "&response=".md5_hex($lj1->{'challenge'} . md5_hex($password)) . "&user=$user" . "&password=" . "&action:login=Log in..."; $req->content($content); $res = $ua->request($req); $isok = $res->is_success; # carp Dumper $res; } } else { $ua->cookie_jar->set_cookie(undef, 'ljsession', $lj2->{'ljsession'}, '/', BASE_DOMAIN); $isok = 1; } if ($isok) { $ua->cookie_jar->set_cookie(undef, 'langpref', 'en_LJ/' . $lj1->{'server_time'}, '/', '.'.BASE_DOMAIN); $ua->cookie_jar->set_cookie(undef, 'BMLschemepref', 'lynx', '/', '.'.BASE_DOMAIN); $ua->cookie_jar->set_cookie(undef, 'CP', 'null*', '/', '.'.BASE_DOMAIN); logmsg("got LJ cookies.\n", 1); return 1; } else { logmsg('Error logging in to server.', 0); return undef; } } else { logmsg('Error logging in to server.', 0); return undef; } } else { logmsg('Error logging in to server.', 0); return undef; } } =item get_page($url) download page from the remote host =cut sub get_page { my ($url, $is_image) = @_; if (!$is_image) { $url .= ($url =~ /\?/)? '&format=light' : '?format=light' if ($url !~ /format=light/); $url .= '&style=mine'; } # FIXME: replace same-protocol URLs with http: for now. if ($url =~ m#^//#) { $url = 'http:' . $url; } logmsg("<< $url\n",2); $req = new HTTP::Request GET => $url; $req->header('Accept-Encoding' => 'gzip;q=1.0, *;q=0'); my ($trycount, $res); foreach (1 .. MAX_TRIES) { $trycount = $_; logmsg("retrying $url...\n", 0) if ($_ > 1); #send request $res = $ua->request($req); #process responce if ($res->is_client_error) { # 4xx codes logmsg("!! " . $res->status_line . ": Deleted/hidden entry? Skipping URL\n",0); last; } elsif ($res->is_success) { $stat{'pages_ok'}++; my $page_body = ($res->content_encoding && ($res->content_encoding =~ /gzip/))? Compress::Zlib::memGunzip($res->content) : $res->content; return (0, $page_body); } else { logmsg("!! $_. " . $res->status_line . ". retrying in 3 seconds...\n", 0); sleep 3; } } $stat{'pages_err'}++; logmsg("!! failed to get $url after $trycount attempt" . (($trycount > 1)? '': 's') . "\n", 0); # save failed downloads to log file print LF "Failed: $url\n"; return ($res->is_client_error, undef); # first value means "page-specific error, outer loop should continue fetching other pages" } sub logmsg { my ($message, $loglvl) = @_; if (!defined $loglvl) { print $message; } else { #carp $message if ($loglvl <= DEBUG_LEVEL); print $message if ($loglvl <= DEBUG_LEVEL); } } =item build_index($user) build index file for the given user =cut sub build_index { my ($user) = @_; my ($month, $year, @months); @months = ('','January','February','March','April','May','June', 'July','August','September','October','November','December'); # skip to next dir if there is no such user unless (-d LOCAL_DIR . $user) { logmsg(LOCAL_DIR . $user . " not found."); return; } # traverse directory tree calling process_html for each file found find({ wanted => \&process_html_file, preprocess => \&sort_directory }, LOCAL_DIR . $user); # write index.html open DF, ">" . LOCAL_DIR . $user . "/index.html" or die "error opening " . LOCAL_DIR . $user . "/index.html" . "for writing: $!\n"; my $charset = (UTF8_DECODE || $opt_U)? LOCAL_CHARSET : 'utf-8'; print DF < Index file for $user livejournal


$user's livejournal.   EOH print DF "$stat{count_memos} memories " if (scalar keys %memories); if (scalar keys %posts) { print DF " | $stat{count_posts} posts: "; foreach (sort keys %posts) { # foreach year print DF "$_ "; } print DF "\n"; } print DF "
last updated: " . (scalar localtime) . "\n"; print DF '
' . "\n"; my ($postid, $title, $locallink, $key, $amuser, $itemid, $link, $metapost, $filename); if (scalar keys %posts) { foreach $year (reverse sort keys %posts) { # $posts{$year} is a reference to the hash of months # year header print DF "\n"; print DF '


' . "\n"; print DF '' . $year . ': '; print DF "" . $months[$_+0] . " | " foreach (sort {$a <=> $b} keys %{$posts{$year}}); print DF '

' . "\n"; # year body for $month (reverse sort {$a <=> $b} keys %{$posts{$year}}) { print DF "[ $months[$month] ]
\n"; for $metapost (@{$posts{$year}->{$month}}) { # make separate link if there is a link in a title (avoid nested 's) $postid = $metapost->{'itemid'}; $filename = $metapost->{'filename'}; $title = $metapost->{'title'}; $title = 'no title' unless ($title =~ /\S/); $locallink = (index($title, ' -1)? "[read]  $title" : "$title"; print DF "" . $metapost->{'day'} . " $locallink   | {'link_type'}, $metapost->{'amuser'}, $metapost->{'itemid'}) . "?usescheme=lynx\" target=\"_new\">»
\n"; print DF "

\n"; } } } } if (scalar keys %memories) { print DF ''. "\n"; print DF '


' . "\n"; print DF 'Memories: ' . "\n"; print DF '
' . "\n"; # # foreach keyword foreach $key (sort keys %memories) { print DF "
\n
$key
\n"; foreach $metapost (@{$memories{$key}}) { $amuser = $metapost->{'amuser'}; $itemid = $metapost->{'itemid'}; $title = $metapost->{'title'}; $filename = $metapost->{'filename'}; $title = 'no title' unless ($title =~ /\S/); $link = ($amuser)? &make_link($metapost->{'link_type'}, $amuser, $itemid) ."?usescheme=lynx" : "@{[POST_SCRIPT]}?itemid=$itemid&usescheme=lynx"; # make separate link if there is a link it title (avoid nested 's) $locallink = (index($title, ' -1)? "[read]  $title" : "$title"; print DF "
* $amuser:   $locallink   | »
\n"; } print DF "
\n"; } } print DF <

generated by ljsm @{[CVSVERSION]} EOE close DF or warn "Error closing index.html: $!\n"; #make_hhp() if ($^O =~ /win32/i); } # sort filenames so that the most recent posts go first sub sort_directory { return sort {$b cmp $a} grep (/\w/, @_); } # callback subroutine for build_index # sub process_html_file { my ($line, $link, $kw, $title, $description, $amuser, $itemid, $date, $locallink, $user, $metainfo, $is_utf8); return unless ($File::Find::dir =~ m#(\w+)/(\d{4}/\d{1,2}|memories)#); $user = $1; return unless (-s && /\.html$/); # $_ is set to file name and we are inside target directory open DF, "<$_" or die "Error opening $File::Find::name for reading: $!\n"; # search for link, keywords, title and date $title = ''; while ($line = ) { $kw = $1 if ($line =~ //); $title = $1 if ($line =~ m#(.*): $user#); $title = $1 if ($line =~ m#(.*?)#i); $title = $1 if ($line =~ m##); $title = "$1" if ($line =~ m#Error
(.*)$#i); $title = "$1" if ($line =~ m#^

Error

(.*)

$#i); $description = $1 if ($line =~ m##); $date = $1 if (!$date && $line =~ m#href="@{[BASE_URL]}users/\w+/day/\d\d\d\d/\d\d/(\d{1,2})"#); $date = $1 if (!$date && $line =~ m#href="@{[BASE_URL]}users/\w+/\d\d\d\d/\d\d/(\d{1,2})/"#); $date = $1 if (!$date && $line =~ m#href="https?://(?:[-\w]+\.)?@{[BASE_DOMAIN]}/(?:\w+/)?\d\d\d\d/\d\d/(\d{1,2})/"#); $users{$1}{$File::Find::name} = 1 if ($line =~ m#userinfo.bml\?user=(\w+)#); $is_utf8 = 1 if ($line =~ m##); } $date = sprintf("%02d. ", $date) if $date; $kw = 'default' unless $kw; $title = &from_utf8($title) if ($is_utf8 && (UTF8_DECODE || $opt_U)); $metainfo = { 'link_type' => 'P', 'filename' => $_, 'title' => $title, 'day' => $date, 'keywords' => $kw, 'description' => $description }; close DF or warn "Error closing $File::Find::name : $!\n"; if ($File::Find::dir =~ /memories/) { # memories $stat{'count_memos'}++; $_ =~ m#(\w*)_(\d+)\.html#; $metainfo->{'amuser'} = $1; $metainfo->{'itemid'} = $2; push @{$memories{$kw}}, $metainfo; } elsif ($File::Find::name =~ m#(\w+)/(\d{4})/(\d{1,2})/(\d+).html#) { # posts # $1 = user , $2 = year, $3 = month, $4 = itemid $_ = html file name $stat{'count_posts'}++; $metainfo->{'itemid'} = $4; $metainfo->{'amuser'} = $1; $posts{$2} = {$3 => []} if (!defined $posts{$2}); push @{$posts{$2}->{$3}}, $metainfo; } else { # html file in unknown directory. just do nothing } } #write files to compile windows help file for given user =cut sub make_hhp { my ($postid); # write main project file open DF, ">" . LOCAL_DIR . $user . "/$user.hhp" or die "error opening " . LOCAL_DIR . $user . "/$user.hhp" . "for writing: $!\n"; print DF <" . LOCAL_DIR . $user . "/TOC.hhc" or die "error opening " . LOCAL_DIR . $user . "/TOC.hhc" . "for writing: $!\n"; print DF <