[dw-free] Allow importing of your journal from another LiveJournal-based site.
[commit: http://hg.dwscoalition.org/dw-free/rev/6115e6b86caf]
http://bugs.dwscoalition.org/show_bug.cgi?id=114
Work on the comment import system. About 98% functional.
Patch by mark.
Files modified:
http://bugs.dwscoalition.org/show_bug.cgi?id=114
Work on the comment import system. About 98% functional.
Patch by
![[staff profile]](https://www.dreamwidth.org/img/silk/identity/user_staff.png)
Files modified:
- cgi-bin/DW/Worker/ContentImporter.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm
- cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
- htdocs/misc/import.bml
-------------------------------------------------------------------------------- diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter.pm --- a/cgi-bin/DW/Worker/ContentImporter.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter.pm Wed Mar 11 05:36:29 2009 +0000 @@ -83,126 +83,7 @@ sub merge_watch { } -=head2 C<< $class->post_event( $user, $hashref, $comment ) >> - -$event is a hashref representation of a single comment, with the following format: - - { - subject => "Comment", - body => 'I DID STUFF!!!!!', - posterid => $local_userid, - - jitemid => $local_jitemid, - - parentid => $local_parent, - - state => 'A', - } - -=cut -sub insert_comment { - my ( $class, $u, $opts, $_comment ) = @_; - - my $errref; - - my $jitem = LJ::Entry->new( $u, jitemid=>$_comment->{jitemid} ); - my $user = undef; - my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $_comment->{id} << 8 ); - $user = LJ::load_userid( $_comment->{posterid} ) if $_comment->{posterid}; - - my $date = $_comment->{date}; - $date =~ s/T/ /; - $date =~ s/Z//; - - my $comment = { - subject => $_comment->{subject}, - body => $_comment->{body}, - - state => $_comment->{state}, - u => $user, - - props => { - import_source => $source, - }, - - no_urls => 1, - no_esn => 1, - }; - my $item = { - itemid => $_comment->{jitemid}, - }; - my $parent = { - talkid => $_comment->{parentid}, - }; - - unless ($date) { - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = gmtime(); - $date = sprintf( "%4i-%2i-%2i %2i:%2i:%2i", 1900+$year, $mday, $mon, $hour, $min, $sec ); - } - - my $jtalkid = LJ::Talk::Post::enter_imported_comment( $u, $parent, $item, $comment, $date, \$errref ); - return undef unless $jtalkid; - return $jtalkid; -} - -=head2 C<< $class->get_comment_map( $user, $hashref ) - -Returns a hashref mapping import_source keys to jtalkids - -=cut -sub get_comment_map { - my ( $class, $u, $opts ) = @_; - return $opts->{talk_map} if $opts->{talk_map}; - - 
my $p = LJ::get_prop( "talk", "import_source" ); - return {} unless $p; - - my $dbr = LJ::get_cluster_reader( $u ); - my %map; - my $sth = $dbr->prepare( "SELECT jtalkid, value FROM talkprop2 WHERE journalid = ? AND tpropid = ?" ); - - $sth->execute( $u->id, $p->{id} ); - - while ( my ($jitemid,$value) = $sth->fetchrow_array ) { - $map{$value} = $jitemid; - } - - return \%map; -} - =head1 Helper Functions - -=head2 C<< $class->ratelimit_request( $hashref ) >> - -Imposes a ratelimit on the number of times this function can be called - -$hashref *must* be the same hash between calls, and must have a _rl_requests and _rl_seconds member. - -=cut -sub ratelimit_request { - my ( $class, $hashref ) = @_; - - # the next two lines load in the ratio - for example, a maximum of 4 requests in 1 second - my $num_requests = $hashref->{'_rl_requests'}; - my $num_seconds = $hashref->{'_rl_seconds'}; - - # $state is an arrayref containing timestamps - my $state = $hashref->{'_rl_delay_state'}; - if ( !defined( $state ) ) { - $state = []; - $hashref->{'_rl_delay_state'} = $state; - } - - my $now = time(); - push( @{$state}, $now ); - return if @{$state} < $num_requests; # we haven't done enough requests to justify a wait yet - - my $oldest = shift( @{$state} ); - if ( ( $now - $oldest ) < $num_seconds ) { - sleep( $num_seconds - ( $now - $oldest ) ); - } - return; -} =head2 C<< $class->import_data( $userid, $import_data_id ) >> diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm Wed Mar 11 05:36:29 2009 +0000 @@ -203,19 +203,6 @@ sub retry_delay { return ( 10, 30, 60, 300, 600 )[$fails]; } -sub do_authed_fetch { - my ( $opts, $mode, $startid, $numitems, $sess ) = @_; - - # hit up the server with the specified information and return the raw content - my $ua = LWP::UserAgent->new; - my $request = 
HTTP::Request->new( GET => "http://$opts->{server}/export_comments.bml?get=$mode&startid=$startid&numitems=$numitems" ); - $request->push_header( Cookie => "ljsession=$sess" ); - my $response = $ua->request( $request ); - return if $response->is_error; - my $xml = $response->content; - return $xml if $xml; -} - ############################################################################## # MASON DIXON LINE \o/ # South of here, these functions have been updated with the changes Mark is @@ -334,12 +321,14 @@ sub get_remapped_userids { unless defined $oid; } - unless ( defined $fid ) { - warn "[$$] Remapping feed userid of $data->{hostname}:$user\n"; - $fid = $class->remap_username_feed( $data, $user ); - warn " FEED USERID IS STILL UNDEFINED\n" - unless defined $fid; - } +# FIXME: this is temporarily disabled while we hash out exactly how we want +# this functionality to work. +# unless ( defined $fid ) { +# warn "[$$] Remapping feed userid of $data->{hostname}:$user\n"; +# $fid = $class->remap_username_feed( $data, $user ); +# warn " FEED USERID IS STILL UNDEFINED\n" +# unless defined $fid; +# } $dbh->do( 'REPLACE INTO import_usermap (hostname, username, identity_userid, feed_userid) VALUES (?, ?, ?, ?)', undef, $data->{hostname}, $user, $oid, $fid ); @@ -423,18 +412,13 @@ sub remap_lj_user { } sub get_lj_session { - my $imp = $_[0]; + my ( $class, $imp ) = @_; - my $r = call_xmlrpc( $imp, 'sessiongenerate', { expiration => 'short' } ); + my $r = $class->call_xmlrpc( $imp, 'sessiongenerate', { expiration => 'short' } ); return undef unless $r && ! $r->{fault}; return $r->{ljsession}; -} - -sub d { - warn shift(@_) . 
"\n" - if $LJ::IS_DEV_SERVER; } sub xmlrpc_call_helper { diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm Wed Mar 11 05:36:29 2009 +0000 @@ -20,6 +20,7 @@ use base 'DW::Worker::ContentImporter::L use base 'DW::Worker::ContentImporter::LiveJournal'; use Carp qw/ croak confess /; +use Time::HiRes qw/ tv_interval gettimeofday /; use DW::Worker::ContentImporter::Local::Comments; # these come from LJ @@ -40,215 +41,329 @@ sub try_work { my ( $class, $job ) = @_; my $opts = $job->arg; my $data = $class->import_data( $opts->{userid}, $opts->{import_data_id} ); + my $begin_time = [ gettimeofday() ]; # failure wrappers for convenience my $fail = sub { return $class->fail( $data, 'lj_comments', $job, @_ ); }; my $ok = sub { return $class->ok( $data, 'lj_comments', $job ); }; my $temp_fail = sub { return $class->temp_fail( $data, 'lj_comments', $job, @_ ); }; + my $status = sub { return $class->status( $data, 'lj_comments', { @_ } ); }; + + # logging sub + my ( $logfile, $last_log_time ); + my $log = sub { + $last_log_time ||= [ gettimeofday() ]; + + unless ( $logfile ) { + mkdir "$LJ::HOME/logs/imports"; + mkdir "$LJ::HOME/logs/imports/$opts->{userid}"; + open $logfile, ">>$LJ::HOME/logs/imports/$opts->{userid}/$opts->{import_data_id}.lj_comments.$$" + or return $temp_fail->( 'Internal server error creating log.' ); + print $logfile "[0.00s 0.00s] Log started at " . LJ::mysql_time(gmtime()) . ".\n"; + } + + my $fmt = "[%0.4fs %0.1fs] " . shift() . 
"\n"; + my $msg = sprintf( $fmt, tv_interval( $last_log_time ), tv_interval( $begin_time), @_ ); + + print $logfile $msg; + $job->debug( $msg ); + + $last_log_time = [ gettimeofday() ]; + }; # setup my $u = LJ::load_userid( $data->{userid} ) or return $fail->( 'Unable to load target with id %d.', $data->{userid} ); + $log->( 'Import begun for %s(%d).', $u->user, $u->userid ); - # temporary failure, this code hasn't been ported yet - return $fail->( 'oops, not ready yet' ); -} + # this will take a entry_map (old URL -> new jitemid) and convert it into a jitemid map (old jitemid -> new jitemid) + my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {}; + $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) ); -1; -__END__ + # now backfill into jitemid_map + my $jitemid_map = {}; + foreach my $url ( keys %$entry_map ) { + my $jitemid = $1 >> 8 + if $url =~ m!/(\d+)\.html$!; + $jitemid_map->{$jitemid} = $entry_map->{$url}; + } + + # this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid) + my $talk_map = DW::Worker::ContentImporter::Local::Comments->get_comment_map( $u ) || {}; + $log->( 'Loaded comment map with %d entries.', scalar( keys %$talk_map ) ); -### WORK GOES HERE -$opts->{identity_map} ||= {}; + # now reverse it as above + my $jtalkid_map = {}; + foreach my $url ( keys %$talk_map ) { + my $jtalkid = $1 >> 8 + if $url =~ m!thread=(\d+)$!; + $jtalkid_map->{$jtalkid} = $talk_map->{$url}; + } -# this will take a entry_map (old URL -> new jitemid) and convert it into a jitemid map (old jitemid -> new jitemid) -# TODO: Make sure you are dealing with the correct site. 
-unless ( $opts->{jitemid_map} ) { - $opts->{entry_map} ||= DW::Worker::ContentImporter->get_entry_map($u,$opts); - $opts->{jitemid_map} = {}; - foreach my $url ( keys %{$opts->{entry_map}} ) { - next unless $url =~ m/$opts->{user_path}/; - my ($ditemid) = $url =~ m/\/([0-9]+)\.html?$/; - my $jitemid = $ditemid >> 8; - $opts->{jitemid_map}->{$jitemid} = $opts->{entry_map}->{$url}; + # parameters for below + my ( %meta, @userids, $identity_map ); + my ( $maxid, $server_max_id, $server_next_id, $lasttag ) = ( 0, 0, 1, '' ); + + # setup our parsing function + my $meta_handler = sub { + # this sub actually processes incoming meta information + $lasttag = $_[1]; + shift; shift; # remove the Expat object and tag name + my %temp = ( @_ ); # take the rest into our humble hash + + # if we were last getting a comment, start storing the info + if ( $lasttag eq 'comment' ) { + # get some data on a comment + $meta{$temp{id}} = { + id => $temp{id}, + posterid => $temp{posterid}+0, + state => $temp{state} || 'A', + }; + + } elsif ( $lasttag eq 'usermap' && ! 
exists $identity_map->{$temp{id}} ) { + push @userids, $temp{id}; + + my ( $local_oid, $local_fid ) = $class->get_remapped_userids( $data, $temp{user} ); + $identity_map->{$temp{id}} = $local_oid; + + $log->( 'Mapped remote %s(%d) to local userid %d.', $temp{user}, $temp{id}, $local_oid ); + } + }; + my $meta_closer = sub { + # we hit a closing tag so we're not in a tag anymore + $lasttag = ''; + }; + my $meta_content = sub { + # if we're in a maxid tag, we want to save that value so we know how much further + # we have to go in downloading meta info + return undef + unless $lasttag eq 'maxid' || + $lasttag eq 'nextid'; + + # save these values for later + $server_max_id = $_[1] + 0 if $lasttag eq 'maxid'; + $server_next_id = $_[1] + 0 if $lasttag eq 'nextid'; + }; + + # hit up the server for metadata + while ( defined $server_next_id && $server_next_id =~ /^\d+$/ ) { + $log->( 'Fetching metadata; max_id = %d, next_id = %d.', $server_max_id || 0, $server_next_id || 0 ); + + my $content = $class->do_authed_comment_fetch( + $data, 'comment_meta', $server_next_id, $COMMENTS_FETCH_META + ); + return $temp_fail->( 'Error fetching comment metadata from server.' ) + unless $content; + + $server_next_id = undef; + + # now we want to XML parse this + my $parser = new XML::Parser( + Handlers => { + Start => $meta_handler, + Char => $meta_content, + End => $meta_closer + } + ); + $parser->parse( $content ); } -} + $log->( 'Finished fetching metadata.' ); -# this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid) -# TODO: Make sure you are dealing with the correct site. 
-unless ( $opts->{jtalkid_map} ) { - $opts->{talk_map} ||= DW::Worker::ContentImporter->get_comment_map( $u, $opts ); - $opts->{jtalkid_map} = {}; - foreach my $url ( keys %{$opts->{talk_map}} ) { - next unless $url =~ m/$opts->{user_path}/; - my ( $dtalkid ) = $url =~ m/\?thread=([0-9]+)$/; - my $jtalkid = $dtalkid >> 8; - $opts->{jtalkid_map}->{$jtalkid} = $opts->{talk_map}->{$url}; + # body handling section now + my ( $lastid, $curid, @tags ) = ( 0, 0 ); + + # setup our handlers for body XML info + my $body_handler = sub { + # this sub actually processes incoming body information + $lasttag = $_[1]; + push @tags, $lasttag; + shift; shift; # remove the Expat object and tag name + my %temp = ( @_ ); # take the rest into our humble hash + if ( $lasttag eq 'comment' ) { + # get some data on a comment + $curid = $temp{id}; + $meta{$curid}{parentid} = $temp{parentid}+0; + $meta{$curid}{jitemid} = $temp{jitemid}+0; + } + }; + my $body_closer = sub { + # we hit a closing tag so we're not in a tag anymore + my $tag = pop @tags; + $lasttag = $tags[0]; + }; + my $body_content = sub { + # this grabs data inside of comments: body, subject, date + return unless $curid; + return unless $lasttag =~ /(?:body|subject|date)/; + $meta{$curid}{$lasttag} .= $_[1]; + # have to .= it, because the parser will split on punctuation such as an apostrophe + # that may or may not be in the data stream, and we won't know until we've already + # gotten some data + }; + + # start looping to fetch all of the comment bodies + while ( $lastid < $server_max_id ) { + $log->( 'Fetching bodydata; last_id = %d, max_id = %d.', $lastid || 0, $server_max_id || 0 ); + + my $content = $class->do_authed_comment_fetch( + $data, 'comment_body', $lastid+1, $COMMENTS_FETCH_BODY + ); + return $temp_fail->( 'Error fetching comment body data from server.' 
) + unless $content; + + # now we want to XML parse this + my $parser = new XML::Parser( + Handlers => { + Start => $body_handler, + Char => $body_content, + End => $body_closer + } + ); + $parser->parse( $content ); + + # the exporter should always return the maximum number of items, so loop again. of course, + # this will fail nicely as soon as some site we're importing from reduces the max items + # they return due to load. http://community.livejournal.com/changelog/5907095.html + $lastid += $COMMENTS_FETCH_BODY; } -} + + # now iterate over each comment and build the nearly final structure + foreach my $comment ( values %meta ) { -# downloaded meta data information -my %meta; -my @userids; - -# setup our parsing function -my $maxid = 0; -my $server_max_id = 0; -my $server_next_id = 1; -my $lasttag = ''; -my $meta_handler = sub { - # this sub actually processes incoming meta information - $lasttag = $_[1]; - shift; shift; # remove the Expat object and tag name - my %temp = ( @_ ); # take the rest into our humble hash - if ( $lasttag eq 'comment' ) { - # get some data on a comment - $meta{$temp{id}} = { - id => $temp{id}, - posterid => $temp{posterid}+0, - state => $temp{state} || 'A', - }; - } elsif ( $lasttag eq 'usermap' && !$opts->{identity_map}->{$temp{id}} ) { - push @userids, $temp{id}; - $opts->{identity_map}->{$temp{id}} = remap_username_friend( $opts, $temp{user} ); - } -}; -my $meta_closer = sub { - # we hit a closing tag so we're not in a tag anymore - $lasttag = ''; -}; -my $meta_content = sub { - # if we're in a maxid tag, we want to save that value so we know how much further - # we have to go in downloading meta info - return unless ( $lasttag eq 'maxid' ) || ( $lasttag eq 'nextid' ); - $server_max_id = $_[1] + 0 if ( $lasttag eq 'maxid' ); - $server_next_id = $_[1] + 0 if ( $lasttag eq 'nextid' ); -}; - -# hit up the server for metadata -while ( defined $server_next_id && $server_next_id =~ /^\d+$/ ) { - 
DW::Worker::ContentImporter->ratelimit_request( $opts ); - my $content = do_authed_fetch( $opts, 'comment_meta', $server_next_id, $COMMENTS_FETCH_META, $session ); - #die "Some sort of error fetching metadata from server" unless $content; - - $server_next_id = undef; - - # now we want to XML parse this - my $parser = new XML::Parser( Handlers => { Start => $meta_handler, Char => $meta_content, End => $meta_closer } ); - $parser->parse( $content ); -} - -# setup our handlers for body XML info -my $lastid = 0; -my $curid = 0; -my @tags; -my $body_handler = sub { - # this sub actually processes incoming body information - $lasttag = $_[1]; - push @tags, $lasttag; - shift; shift; # remove the Expat object and tag name - my %temp = ( @_ ); # take the rest into our humble hash - if ( $lasttag eq 'comment' ) { - # get some data on a comment - $curid = $temp{id}; - $meta{$curid}{parentid} = $temp{parentid}+0; - $meta{$curid}{jitemid} = $temp{jitemid}+0; - # line below commented out because we shouldn't be trying to be clever like this ;p - # $lastid = $curid if $curid > $lastid; - } -}; -my $body_closer = sub { - # we hit a closing tag so we're not in a tag anymore - my $tag = pop @tags; - $lasttag = $tags[0]; -}; -my $body_content = sub { - # this grabs data inside of comments: body, subject, date - return unless $curid; - return unless $lasttag =~ /(?:body|subject|date)/; - $meta{$curid}{$lasttag} .= $_[1]; - # have to .= it, because the parser will split on punctuation such as an apostrophe - # that may or may not be in the data stream, and we won't know until we've already - # gotten some data -}; - -# at this point we have a fully regenerated metadata cache and we want to grab a block of comments -while ( 1 ) { - DW::Worker::ContentImporter->ratelimit_request( $opts ); - my $content = do_authed_fetch( $opts, 'comment_body', $lastid+1, $COMMENTS_FETCH_BODY, $session ); - - # now we want to XML parse this - my $parser = new XML::Parser( Handlers => { Start => 
$body_handler, Char => $body_content, End => $body_closer } ); - $parser->parse( $content ); - - # now at this point what we have to decide whether we should loop again for more metadata - $lastid += $COMMENTS_FETCH_BODY; - last unless $lastid < $server_max_id; -} - -foreach my $comment ( values %meta ) { - $comment->{posterid} = $opts->{identity_map}->{$comment->{posterid}}; - $comment->{jitemid} = $opts->{jitemid_map}->{$comment->{jitemid}}; - - $comment->{unresolved} = 1 if ($comment->{parentid}); - - my $body = remap_lj_user($opts,$comment->{body}); - $body =~ s/<.+?-embed-.+?>//g; - $body =~ s/<.+?-template-.+?>//g; - $comment->{body} = $body; - - $comment->{orig_id} = $comment->{id}; - - if ($comment->{parentid} && $comment->{state} ne 'D') { - $meta{$comment->{parentid}}->{has_children} = 1; - } -} - -my @to_import = sort { ( $a->{id}+0 ) <=> ( $b->{id}+0 ) } values %meta; -my $had_unresolved = 1; -# This loop should never need to run through more then once -# but, it will *if* for some reason a comment comes before its parent -# which *should* never happen, but I'm handling it anyway, just in case. -while ($had_unresolved) { - $had_unresolved = 0; - my $ct = 0; - my $ct_unresolved = 0; - foreach my $comment (@to_import) { - next if $comment->{done}; # Skip this comment if it was already imported this round - next if $opts->{jtalkid_map}->{$comment->{orig_id}}; # Or on a previous import round - next if ( $comment->{state} eq 'D' && !$comment->{has_children} ); # Or if the comment is deleted, and child-less - $ct++; - if ( $comment->{unresolved} ) { - # lets see if this is resolvable at the moment - # A resolvable comment is a comment that's parent is already in the DW database - # and an unresolved comment is a comment that has a parent that is currently not in the database. 
- if ( $opts->{jtalkid_map}->{$comment->{parentid}} ) { - $comment->{parentid} = $opts->{jtalkid_map}->{$comment->{parentid}}; - $comment->{unresolved} = 0; - } - } - if ( $comment->{unresolved} ) { - $ct_unresolved++; - $had_unresolved = 1; + # if we weren't able to map to a jitemid (last entry import a while ago?) + # or some other problem, log it and bail + unless ( $jitemid_map->{$comment->{jitemid}} ) { + $comment->{skip} = 1; + $log->( 'NO MAPPED ENTRY: remote values: jitemid %d, posterid %d, jtalkid %d.', + $comment->{jitemid}, $comment->{posterid}, $comment->{id} ); next; } - my $talkid = DW::Worker::ContentImporter->insert_comment( $u, $opts, $comment ); - $opts->{jtalkid_map}->{$comment->{id}} = $talkid; - $comment->{id} = $talkid; - $comment->{done} = 1; + + # basic mappings + $comment->{posterid} = $identity_map->{$comment->{posterid}}; + $comment->{jitemid} = $jitemid_map->{$comment->{jitemid}}; + $comment->{orig_id} = $comment->{id}; + + # unresolved comments means we haven't got the parent in the database + # yet so we can't post this one + $comment->{unresolved} = 1 + if $comment->{parentid}; + + # the reverse of unresolved, tell the parent it has visible children + $meta{$comment->{parentid}}->{has_children} = 1 + if $comment->{parentid} && $comment->{state} ne 'D'; + + # remap content (user links) then remove embeds/templates + my $body = $class->remap_lj_user( $data, $comment->{body} ); + $body =~ s/<.+?-embed-.+?>/[Embedded content removed during import.]/g; + $body =~ s/<.+?-template-.+?>/[Templated content removed during import.]/g; + $comment->{body} = $body; } - # Sanity check. This *really* should never happen. - # This is here to prevent an endless loop, just in case. - # The only way I can see this firing is if a comment is just - # totally missing. 
- if ( $ct == $ct_unresolved && $had_unresolved ) { - # FIXME: Error - $had_unresolved = 0; # Set this to 0 so the loop falls through + + # variable setup for the database work + my @to_import = sort { ( $a->{id}+0 ) <=> ( $b->{id}+0 ) } values %meta; + my $had_unresolved = 1; + + # This loop should never need to run through more then once + # but, it will *if* for some reason a comment comes before its parent + # which *should* never happen, but I'm handling it anyway, just in case. + while ( $had_unresolved ) { + + # variables, and reset + my ( $ct, $ct_unresolved ) = ( 0, 0 ); + $had_unresolved = 0; + + # now doing imports! + foreach my $comment ( @to_import ) { + next if $comment->{skip}; + + $log->( "Attempting to import remote id %d, parentid %d, state %s.", + $comment->{orig_id}, $comment->{parentid}, $comment->{state} ); + + # rules we might skip a content with + next if $comment->{done}; # Skip this comment if it was already imported this round + next if $jtalkid_map->{$comment->{orig_id}}; # Or on a previous import round + next if $comment->{state} eq 'D' && !$comment->{has_children}; # Or if the comment is deleted, and child-less + + # now we know this one is going in the database + $ct++; + + # try to resolve + if ( $comment->{unresolved} ) { + # lets see if this is resolvable at the moment + # A resolvable comment is a comment that's parent is already in the DW database + # and an unresolved comment is a comment that has a parent that is currently not in the database. + if ( $jtalkid_map->{$comment->{parentid}} ) { + $comment->{parentid} = $jtalkid_map->{$comment->{parentid}}; + $comment->{unresolved} = 0; + + $log->( 'Resolved unresolved comment to local parentid %d.', + $comment->{parentid} ); + + } else { + # guess we couldn't resolve it :( next pass! + $ct_unresolved++; + $had_unresolved = 1; + + $log->( 'Failed to resolve comment.' 
); + + next; + } + } + + # if we get here we're good to insert into the database + my $err = ""; + my $talkid = DW::Worker::ContentImporter::Local::Comments->insert_comment( $u, $comment, \$err ); + if ( $talkid ) { + $log->( 'Successfully imported source %d to new jtalkid %d.', $comment->{id}, $talkid ); + } else { + $log->( 'Failed to import comment %d: %s.', $comment->{id}, $err ); + return $temp_fail->( 'Failure importing comment: %s.', $err ); + } + + # store this information + $jtalkid_map->{$comment->{id}} = $talkid; + $comment->{id} = $talkid; + $comment->{done} = 1; + } + + # Sanity check. This *really* should never happen. + # This is here to prevent an endless loop, just in case. + # The only way I can see this firing is if a comment is just + # totally missing. + if ( $ct == $ct_unresolved && $had_unresolved ) { + $log->( 'The unthinkable happened! We hit an iceberg!!!' ); + $log->( 'The above error: %d == %d && %d (had_unresolved).', $ct, $ct_unresolved, $had_unresolved ); + return $fail->( 'Found unresolvable comment chain.' ); + } } -} -$opts->{no_comments} = 1; - + return $ok->(); } -1; \ No newline at end of file +sub do_authed_comment_fetch { + my ( $class, $data, $mode, $startid, $numitems ) = @_; + + # if we don't have a session, then let's generate one + $data->{_session} ||= $class->get_lj_session( $data ); + + # hit up the server with the specified information and return the raw content + my $ua = LWP::UserAgent->new; + my $request = HTTP::Request->new( GET => "http://www.$data->{hostname}/export_comments.bml?get=$mode&startid=$startid&numitems=$numitems" ); + $request->push_header( Cookie => "ljsession=$data->{_session}" ); + + # try to get the response + my $response = $ua->request( $request ); + return if $response->is_error; + + # now get the content + my $xml = $response->content; + return $xml if $xml; + + # total failure... 
+ return undef; +} + + +1; diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm Wed Mar 11 05:36:29 2009 +0000 @@ -99,8 +99,23 @@ sub try_work { $log->( ' retrieved %d items and %d left to sync', $hash->{count}, $hash->{total} ); last if $hash->{count} == $hash->{total}; } - $log->( 'Syncitems finished.' ); + $log->( 'Syncitems finished with %d items pre-prune.', scalar( keys %sync ) ); + # this is an optimization. since we never do an edit event (only post!) we will + # never get changes anyway. so let's remove from the list of things to sync any + # post that we already know about. (not that we really care, but it's much nicer + # on people we're pulling from.) + foreach my $url ( keys %$entry_map ) { + unless ( $url =~ m!/(\d+)\.html$! ) { + $log->( 'URL %s not of expected format in prune.', $url ); + next; + } + + delete $sync{$1 >> 8}; + } + $log->( 'Syncitems now has %d items post-prune.', scalar( keys %sync ) ); + + # simple helper sub my $realtime = sub { my $id = shift; return $sync{$id}->[1] if @{$sync{$id} || []}; @@ -212,6 +227,15 @@ sub try_work { $log->( ' counted %d entries, lastgrab is now %s.', $count, $lastgrab ); } + # mark the comments mode as ready to schedule + my $dbh = LJ::get_db_writer(); + $dbh->do( + q{UPDATE import_items SET status = 'ready' + WHERE userid = ? AND item IN ('lj_comments') + AND import_data_id = ? 
AND status = 'init'}, + undef, $u->id, $opts->{import_data_id} + ); + return $ok->(); } diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm Wed Mar 11 05:36:29 2009 +0000 @@ -56,13 +56,14 @@ sub try_work { my $map = DW::Worker::ContentImporter::Local::TrustGroups->merge_trust_groups( $u, $r->{friendgroups} ); - # mark lj_friends item as able to be scheduled now, and save the map + # store the merged map $dbh->do( q{UPDATE import_data SET groupmap = ? WHERE userid = ? AND import_data_id = ?}, undef, nfreeze( $map ), $u->id, $opts->{import_data_id} ); + # mark lj_friends item as able to be scheduled now, and save the map # FIXME: what do we do on error case? well, hopefully that will be rare... $dbh->do( q{UPDATE import_items SET status = 'ready' diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm --- a/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm Wed Mar 11 05:36:29 2009 +0000 @@ -26,7 +26,100 @@ DW::Worker::ContentImporter::Local::Comm These functions are part of the Saving API for comments. +=head2 C<< $class->get_comment_map( $user, $hashref ) + +Returns a hashref mapping import_source keys to jtalkids + =cut +sub get_comment_map { + my ( $class, $u ) = @_; + + my $p = LJ::get_prop( "talk", "import_source" ); + return {} unless $p; + + my $dbr = LJ::get_cluster_reader( $u ); + my %map; + my $sth = $dbr->prepare( "SELECT jtalkid, value FROM talkprop2 WHERE journalid = ? AND tpropid = ?" 
); + + $sth->execute( $u->id, $p->{id} ); + + while ( my ( $jitemid, $value ) = $sth->fetchrow_array ) { + $map{$value} = $jitemid; + } + + return \%map; +} + +=head2 C<< $class->insert_comment( $u, $comment, $errref ) >> + +$comment is a hashref representation of a single comment, with the following format: + + { + subject => "Comment", + body => 'I DID STUFF!!!!!', + posterid => $local_userid, + + jitemid => $local_jitemid, + + parentid => $local_parent, + + state => 'A', + } + +$errref is a scalar reference to put any error text in. + +=cut + +sub insert_comment { + my ( $class, $u, $cmt, $errref ) = @_; + $errref ||= ''; + + # load the data we need to make this comment + use Data::Dumper; + warn Dumper( $cmt ) unless $cmt->{jitemid}; + + my $jitem = LJ::Entry->new( $u, jitemid => $cmt->{jitemid} ); + my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $cmt->{id} << 8 ); + my $user = LJ::load_userid( $cmt->{posterid} ) + if $cmt->{posterid}; + + # fix the XML timestamp to a useful timestamp + my $date = $cmt->{date}; + $date =~ s/T/ /; + $date =~ s/Z//; + + # sometimes the date is empty + # FIXME: why? Dre had this, when can the date be empty? + $date ||= LJ::mysql_time(); + + # build the data structures we use. we are sort of faking it here. + my $comment = { + subject => $cmt->{subject}, + body => $cmt->{body}, + + state => $cmt->{state}, + u => $user, + + props => { + import_source => $source, + }, + + no_urls => 1, + no_esn => 1, + }; + + my $item = { + itemid => $cmt->{jitemid}, + }; + + my $parent = { + talkid => $cmt->{parentid}, + }; + + # now try to import it and return this as the error code + return LJ::Talk::Post::enter_imported_comment( $u, $parent, $item, $comment, $date, \$errref ); +} + 1; diff -r 89cf10bae98e -r 6115e6b86caf htdocs/misc/import.bml --- a/htdocs/misc/import.bml Tue Mar 10 15:34:40 2009 +0000 +++ b/htdocs/misc/import.bml Wed Mar 11 05:36:29 2009 +0000 @@ -59,6 +59,10 @@ body<= ); return "Database error." 
if $dbh->err; + # if comments are on, turn entries on + $POST{lj_entries} = 1 + if $POST{lj_comments}; + # okay, this is kinda hacky but turn on the right things so we can do # a proper entry import... if ( $POST{lj_entries} ) { @@ -74,6 +78,7 @@ body<= ['lj_friendgroups', 'ready'], # ['lj_friends', 'init' ], ['lj_entries', 'init' ], + ['lj_comments', 'init' ], ); # schedule userpic, bio, and tag imports @@ -163,6 +168,7 @@ EOF # checkbox easier this way my %opts = ( + lj_comments => [ 0, 'Import all comments. Implies: entries.' ], lj_entries => [ 0, 'Import all journal entries. Implies: tags, friendgroups.' ], lj_tags => [ 1, 'Import list of tags.' ], lj_userpics => [ 1, 'Import icons (aka userpics).' ], --------------------------------------------------------------------------------