Mark Smith ([staff profile] mark) wrote in [site community profile] changelog, 2009-03-11 05:36 am

[dw-free] Allow importing of your journal from another LiveJournal-based site.

[commit: http://hg.dwscoalition.org/dw-free/rev/6115e6b86caf]

http://bugs.dwscoalition.org/show_bug.cgi?id=114

Work on the comment import system. About 98% functional.

Patch by [staff profile] mark.

Files modified:
  • cgi-bin/DW/Worker/ContentImporter.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm
  • cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
  • htdocs/misc/import.bml
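
A quick orientation note before the diff: throughout the patch, the importer converts the public IDs that show up in LiveJournal URLs (the ditemid in .../12345.html entry links, the dtalkid in ?thread= links) back to internal IDs by shifting off the low byte.  The packing rule in the sketch below is standard LiveJournal behaviour and the sample number is made up; it is only here to explain the ">> 8" that appears all over the patch.

  #!/usr/bin/perl
  use strict;
  use warnings;

  # public id = ( internal id << 8 ) + anum, so dropping the low byte recovers
  # the internal jitemid/jtalkid that the importer wants to remap.
  my $ditemid = 39755;             # the number in .../39755.html (made-up value)
  my $jitemid = $ditemid >> 8;     # the entry's internal id on the remote site (155)
  my $anum    = $ditemid & 0xff;   # per-entry check byte (75)

  print "jitemid=$jitemid anum=$anum\n";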
--------------------------------------------------------------------------------
diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter.pm
--- a/cgi-bin/DW/Worker/ContentImporter.pm	Tue Mar 10 15:34:40 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter.pm	Wed Mar 11 05:36:29 2009 +0000
@@ -83,126 +83,7 @@ sub merge_watch {
 }
 
 
-=head2 C<< $class->post_event( $user, $hashref, $comment ) >>
-
-$event is a hashref representation of a single comment, with the following format:
-
-  {
-    subject => "Comment",
-    body => 'I DID STUFF!!!!!',
-    posterid => $local_userid,
-
-    jitemid => $local_jitemid,
-
-    parentid => $local_parent,
-
-    state => 'A',
-  }
-
-=cut
-sub insert_comment {
-    my ( $class, $u, $opts, $_comment ) = @_;
-
-    my $errref;
-
-    my $jitem = LJ::Entry->new( $u, jitemid=>$_comment->{jitemid} );
-    my $user = undef;
-    my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $_comment->{id} << 8 );
-    $user = LJ::load_userid( $_comment->{posterid} ) if $_comment->{posterid};
-
-    my $date = $_comment->{date};
-    $date =~ s/T/ /;
-    $date =~ s/Z//;
-
-    my $comment = {
-        subject => $_comment->{subject},
-        body => $_comment->{body},
-
-        state => $_comment->{state},
-        u => $user,
-
-        props => {
-            import_source => $source,
-        },
-
-        no_urls => 1,
-        no_esn => 1,
-    };
-    my $item = {
-        itemid => $_comment->{jitemid},
-    };
-    my $parent = {
-        talkid => $_comment->{parentid},
-    };
-
-    unless ($date) {
-        my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = gmtime();
-        $date = sprintf( "%4i-%2i-%2i %2i:%2i:%2i", 1900+$year, $mday, $mon, $hour, $min, $sec );
-    }
-
-    my $jtalkid = LJ::Talk::Post::enter_imported_comment( $u, $parent, $item, $comment, $date, \$errref );
-    return undef unless $jtalkid;
-    return $jtalkid;
-}
-
-=head2 C<< $class->get_comment_map( $user, $hashref )
-
-Returns a hashref mapping import_source keys to jtalkids
-
-=cut
-sub get_comment_map {
-    my ( $class, $u, $opts ) = @_;
-    return $opts->{talk_map} if $opts->{talk_map};
-
-    my $p = LJ::get_prop( "talk", "import_source" );
-    return {} unless $p;
-
-    my $dbr = LJ::get_cluster_reader( $u );
-    my %map;
-    my $sth = $dbr->prepare( "SELECT jtalkid, value FROM talkprop2 WHERE journalid = ? AND tpropid = ?" );
-
-    $sth->execute( $u->id, $p->{id} );
-
-    while ( my ($jitemid,$value) = $sth->fetchrow_array ) {
-        $map{$value} = $jitemid;
-    }
-
-    return \%map;
-}
-
 =head1 Helper Functions
-
-=head2 C<< $class->ratelimit_request( $hashref ) >>
-
-Imposes a ratelimit on the number of times this function can be called
-
-$hashref *must* be the same hash between calls, and must have a _rl_requests and _rl_seconds member.
-
-=cut
-sub ratelimit_request {
-    my ( $class, $hashref ) = @_;
-
-    # the next two lines load in the ratio - for example, a maximum of 4 requests in 1 second
-    my $num_requests = $hashref->{'_rl_requests'};
-    my $num_seconds  = $hashref->{'_rl_seconds'};
-
-    # $state is an arrayref containing timestamps
-    my $state = $hashref->{'_rl_delay_state'};
-    if ( !defined( $state ) ) {
-        $state = [];
-        $hashref->{'_rl_delay_state'} = $state;
-    }
-
-    my $now = time();
-    push( @{$state}, $now );
-    return if @{$state} < $num_requests;   # we haven't done enough requests to justify a wait yet
-
-    my $oldest = shift( @{$state} );
-    if ( ( $now - $oldest ) < $num_seconds ) {
-        sleep( $num_seconds - ( $now - $oldest ) );
-    }
-    return;
-}
 
 =head2 C<< $class->import_data( $userid, $import_data_id ) >>
 
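
(The ratelimit_request helper removed above is a small sliding-window limiter: allow at most N calls per M seconds, sleeping when the window fills up.  Here is a standalone sketch of the same idea, decoupled from the importer's options hashref that the real helper kept its state in; it is an illustration, not part of the patch.)

  use strict;
  use warnings;
  use Time::HiRes qw/ time sleep /;

  # Build a closure that allows at most $max_requests calls per $window seconds,
  # sleeping when the window is full -- same shape as the removed helper.
  sub make_ratelimiter {
      my ( $max_requests, $window ) = @_;
      my @timestamps;
      return sub {
          my $now = time();
          push @timestamps, $now;
          return if @timestamps < $max_requests;   # not enough calls yet to need a wait
          my $oldest = shift @timestamps;
          sleep( $window - ( $now - $oldest ) )
              if ( $now - $oldest ) < $window;
      };
  }

  my $limit = make_ratelimiter( 4, 1 );   # at most 4 calls per second
  $limit->() for 1 .. 10;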
diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm	Tue Mar 10 15:34:40 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm	Wed Mar 11 05:36:29 2009 +0000
@@ -203,19 +203,6 @@ sub retry_delay {
     return ( 10, 30, 60, 300, 600 )[$fails];
 }
 
-sub do_authed_fetch {
-    my ( $opts, $mode, $startid, $numitems, $sess ) = @_;
-
-    # hit up the server with the specified information and return the raw content
-    my $ua = LWP::UserAgent->new;
-    my $request = HTTP::Request->new( GET => "http://$opts->{server}/export_comments.bml?get=$mode&startid=$startid&numitems=$numitems" );
-    $request->push_header( Cookie => "ljsession=$sess" );
-    my $response = $ua->request( $request );
-    return if $response->is_error;
-    my $xml = $response->content;
-    return $xml if $xml;
-}
-
 ##############################################################################
 # MASON DIXON LINE \o/
 # South of here, these functions have been updated with the changes Mark is
@@ -334,12 +321,14 @@ sub get_remapped_userids {
             unless defined $oid;
     }
 
-    unless ( defined $fid ) {
-        warn "[$$] Remapping feed userid of $data->{hostname}:$user\n";
-        $fid = $class->remap_username_feed( $data, $user );
-        warn "     FEED USERID IS STILL UNDEFINED\n"
-            unless defined $fid;
-    }
+# FIXME: this is temporarily disabled while we hash out exactly how we want
+# this functionality to work.
+#    unless ( defined $fid ) {
+#        warn "[$$] Remapping feed userid of $data->{hostname}:$user\n";
+#        $fid = $class->remap_username_feed( $data, $user );
+#        warn "     FEED USERID IS STILL UNDEFINED\n"
+#            unless defined $fid;
+#    }
 
     $dbh->do( 'REPLACE INTO import_usermap (hostname, username, identity_userid, feed_userid) VALUES (?, ?, ?, ?)',
               undef, $data->{hostname}, $user, $oid, $fid );
@@ -423,18 +412,13 @@ sub remap_lj_user {
 }
 
 sub get_lj_session {
-    my $imp = $_[0];
+    my ( $class, $imp ) = @_;
 
-    my $r = call_xmlrpc( $imp, 'sessiongenerate', { expiration => 'short' } );
+    my $r = $class->call_xmlrpc( $imp, 'sessiongenerate', { expiration => 'short' } );
     return undef
         unless $r && ! $r->{fault};
 
     return $r->{ljsession};
-}
-
-sub d {
-    warn shift(@_) . "\n"
-        if $LJ::IS_DEV_SERVER;
 }
 
 sub xmlrpc_call_helper {
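
(The removed do_authed_fetch reappears below in Comments.pm as do_authed_comment_fetch, and get_lj_session is now called as a class method to feed it: generate a short-lived session over XML-RPC with 'sessiongenerate', then hand it back as an ljsession cookie on the export_comments.bml requests.  A stripped-down sketch of the cookie half with LWP; the hostname and session string are placeholders, not values from the patch.)

  use strict;
  use warnings;
  use LWP::UserAgent;
  use HTTP::Request;

  # Placeholder values; the importer takes the hostname from $data->{hostname}
  # and the session string from get_lj_session().
  my $hostname = 'www.example-lj-site.com';
  my $session  = 'v1:u12345:s99:fakefakefake';

  my $ua  = LWP::UserAgent->new;
  my $req = HTTP::Request->new(
      GET => "http://$hostname/export_comments.bml?get=comment_meta&startid=1&numitems=10000"
  );
  $req->push_header( Cookie => "ljsession=$session" );

  my $res = $ua->request( $req );
  die "fetch failed: " . $res->status_line if $res->is_error;
  print $res->content;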
diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm	Tue Mar 10 15:34:40 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm	Wed Mar 11 05:36:29 2009 +0000
@@ -20,6 +20,7 @@ use base 'DW::Worker::ContentImporter::L
 use base 'DW::Worker::ContentImporter::LiveJournal';
 
 use Carp qw/ croak confess /;
+use Time::HiRes qw/ tv_interval gettimeofday /;
+use XML::Parser;
 use DW::Worker::ContentImporter::Local::Comments;
+use DW::Worker::ContentImporter::Local::Entries;
 
 # these come from LJ
@@ -40,215 +41,329 @@ sub try_work {
     my ( $class, $job ) = @_;
     my $opts = $job->arg;
     my $data = $class->import_data( $opts->{userid}, $opts->{import_data_id} );
+    my $begin_time = [ gettimeofday() ];
 
     # failure wrappers for convenience
     my $fail      = sub { return $class->fail( $data, 'lj_comments', $job, @_ ); };
     my $ok        = sub { return $class->ok( $data, 'lj_comments', $job ); };
     my $temp_fail = sub { return $class->temp_fail( $data, 'lj_comments', $job, @_ ); };
+    my $status    = sub { return $class->status( $data, 'lj_comments', { @_ } ); };
+
+    # logging sub
+    my ( $logfile, $last_log_time );
+    my $log = sub {
+        $last_log_time ||= [ gettimeofday() ];
+
+        unless ( $logfile ) {
+            mkdir "$LJ::HOME/logs/imports";
+            mkdir "$LJ::HOME/logs/imports/$opts->{userid}";
+            open $logfile, ">>$LJ::HOME/logs/imports/$opts->{userid}/$opts->{import_data_id}.lj_comments.$$"
+                or return $temp_fail->( 'Internal server error creating log.' );
+            print $logfile "[0.00s 0.00s] Log started at " . LJ::mysql_time(gmtime()) . ".\n";
+        }
+
+        my $fmt = "[%0.4fs %0.1fs] " . shift() . "\n";
+        my $msg = sprintf( $fmt, tv_interval( $last_log_time ), tv_interval( $begin_time), @_ );
+
+        print $logfile $msg;
+        $job->debug( $msg );
+
+        $last_log_time = [ gettimeofday() ];
+    };
 
     # setup
     my $u = LJ::load_userid( $data->{userid} )
         or return $fail->( 'Unable to load target with id %d.', $data->{userid} );
+    $log->( 'Import begun for %s(%d).', $u->user, $u->userid );
 
-    # temporary failure, this code hasn't been ported yet
-    return $fail->( 'oops, not ready yet' );
-}
+    # this will take an entry_map (old URL -> new jitemid) and convert it into a jitemid map (old jitemid -> new jitemid)
+    my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {};
+    $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) );
 
-1;
-__END__
+    # now backfill into jitemid_map
+    my $jitemid_map = {};
+    foreach my $url ( keys %$entry_map ) {
+        next unless $url =~ m!/(\d+)\.html$!;
+        my $jitemid = $1 >> 8;
+        $jitemid_map->{$jitemid} = $entry_map->{$url};
+    }
+    
+    # this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid)
+    my $talk_map = DW::Worker::ContentImporter::Local::Comments->get_comment_map( $u ) || {};
+    $log->( 'Loaded comment map with %d entries.', scalar( keys %$talk_map ) );
 
-### WORK GOES HERE
-$opts->{identity_map} ||= {};
+    # now reverse it as above
+    my $jtalkid_map = {};
+    foreach my $url ( keys %$talk_map ) {
+        next unless $url =~ m!thread=(\d+)$!;
+        my $jtalkid = $1 >> 8;
+        $jtalkid_map->{$jtalkid} = $talk_map->{$url};
+    }
 
-# this will take a entry_map (old URL -> new jitemid) and convert it into a jitemid map (old jitemid -> new jitemid)
-# TODO: Make sure you are dealing with the correct site.
-unless ( $opts->{jitemid_map} ) {
-    $opts->{entry_map} ||= DW::Worker::ContentImporter->get_entry_map($u,$opts);
-    $opts->{jitemid_map} = {};
-    foreach my $url ( keys %{$opts->{entry_map}} ) {
-        next unless $url =~ m/$opts->{user_path}/;
-        my ($ditemid) = $url =~ m/\/([0-9]+)\.html?$/;
-        my $jitemid = $ditemid >> 8;
-        $opts->{jitemid_map}->{$jitemid} = $opts->{entry_map}->{$url};
+    # parameters for below
+    my ( %meta, @userids, $identity_map );
+    my ( $maxid, $server_max_id, $server_next_id, $lasttag ) = ( 0, 0, 1, '' );
+    
+    # setup our parsing function
+    my $meta_handler = sub {
+        # this sub actually processes incoming meta information
+        $lasttag = $_[1];
+        shift; shift;      # remove the Expat object and tag name
+        my %temp = ( @_ ); # take the rest into our humble hash
+
+        # if we were last getting a comment, start storing the info
+        if ( $lasttag eq 'comment' ) {
+            # get some data on a comment
+            $meta{$temp{id}} = {
+                id => $temp{id},
+                posterid => $temp{posterid}+0,
+                state => $temp{state} || 'A',
+            };
+
+        } elsif ( $lasttag eq 'usermap' && ! exists $identity_map->{$temp{id}} ) {
+            push @userids, $temp{id};
+
+            my ( $local_oid, $local_fid ) = $class->get_remapped_userids( $data, $temp{user} );
+            $identity_map->{$temp{id}} = $local_oid;
+
+            $log->( 'Mapped remote %s(%d) to local userid %d.', $temp{user}, $temp{id}, $local_oid );
+        }
+    };
+    my $meta_closer = sub {
+        # we hit a closing tag so we're not in a tag anymore
+        $lasttag = '';
+    };
+    my $meta_content = sub {
+        # if we're in a maxid tag, we want to save that value so we know how much further
+        # we have to go in downloading meta info
+        return undef
+            unless $lasttag eq 'maxid' || 
+                   $lasttag eq 'nextid';
+
+        # save these values for later
+        $server_max_id = $_[1] + 0 if $lasttag eq 'maxid';
+        $server_next_id = $_[1] + 0 if $lasttag eq 'nextid';
+    };
+    
+    # hit up the server for metadata
+    while ( defined $server_next_id && $server_next_id =~ /^\d+$/ ) {
+        $log->( 'Fetching metadata; max_id = %d, next_id = %d.', $server_max_id || 0, $server_next_id || 0 );
+
+        my $content = $class->do_authed_comment_fetch(
+            $data, 'comment_meta', $server_next_id, $COMMENTS_FETCH_META
+        );
+        return $temp_fail->( 'Error fetching comment metadata from server.' )
+            unless $content;
+    
+        $server_next_id = undef;
+    
+        # now we want to XML parse this
+        my $parser = new XML::Parser(
+            Handlers => {
+                Start => $meta_handler,
+                Char  => $meta_content,
+                End   => $meta_closer
+            }
+        );
+        $parser->parse( $content );
     }
-}
+    $log->( 'Finished fetching metadata.' );
 
-# this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid)
-# TODO: Make sure you are dealing with the correct site.
-unless ( $opts->{jtalkid_map} ) {
-    $opts->{talk_map} ||= DW::Worker::ContentImporter->get_comment_map( $u, $opts );
-    $opts->{jtalkid_map} = {};
-    foreach my $url ( keys %{$opts->{talk_map}} ) {
-        next unless $url =~ m/$opts->{user_path}/;
-        my ( $dtalkid ) = $url =~ m/\?thread=([0-9]+)$/;
-        my $jtalkid = $dtalkid >> 8;
-        $opts->{jtalkid_map}->{$jtalkid} = $opts->{talk_map}->{$url};
+    # body handling section now
+    my ( $lastid, $curid, @tags ) = ( 0, 0 );
+
+    # setup our handlers for body XML info
+    my $body_handler = sub {
+        # this sub actually processes incoming body information
+        $lasttag = $_[1];
+        push @tags, $lasttag;
+        shift; shift;      # remove the Expat object and tag name
+        my %temp = ( @_ ); # take the rest into our humble hash
+        if ( $lasttag eq 'comment' ) {
+            # get some data on a comment
+            $curid = $temp{id};
+            $meta{$curid}{parentid} = $temp{parentid}+0;
+            $meta{$curid}{jitemid} = $temp{jitemid}+0;
+        }
+    };
+    my $body_closer = sub {
+        # we hit a closing tag so we're not in a tag anymore
+        my $tag = pop @tags;
+        $lasttag = $tags[0];
+    };
+    my $body_content = sub {
+        # this grabs data inside of comments: body, subject, date
+        return unless $curid;
+        return unless $lasttag =~ /(?:body|subject|date)/;
+        $meta{$curid}{$lasttag} .= $_[1];
+        # have to .= it, because the parser will split on punctuation such as an apostrophe
+        # that may or may not be in the data stream, and we won't know until we've already
+        # gotten some data
+    };
+    
+    # start looping to fetch all of the comment bodies
+    while ( $lastid < $server_max_id ) {
+        $log->( 'Fetching bodydata; last_id = %d, max_id = %d.', $lastid || 0, $server_max_id || 0 );
+
+        my $content = $class->do_authed_comment_fetch(
+            $data, 'comment_body', $lastid+1, $COMMENTS_FETCH_BODY
+        );
+        return $temp_fail->( 'Error fetching comment body data from server.' )
+            unless $content;
+
+        # now we want to XML parse this
+        my $parser = new XML::Parser(
+            Handlers => {
+                Start => $body_handler,
+                Char  => $body_content,
+                End   => $body_closer
+            }
+        );
+        $parser->parse( $content );
+    
+        # the exporter should always return the maximum number of items, so loop again.  of course,
+        # this will fail nicely as soon as some site we're importing from reduces the max items
+        # they return due to load.  http://community.livejournal.com/changelog/5907095.html
+        $lastid += $COMMENTS_FETCH_BODY;
     }
-}
+    
+    # now iterate over each comment and build the nearly final structure
+    foreach my $comment ( values %meta ) {
 
-# downloaded meta data information
-my %meta;
-my @userids;
-
-# setup our parsing function
-my $maxid = 0;
-my $server_max_id = 0;
-my $server_next_id = 1;
-my $lasttag = '';
-my $meta_handler = sub {
-    # this sub actually processes incoming meta information
-    $lasttag = $_[1];
-    shift; shift;      # remove the Expat object and tag name
-    my %temp = ( @_ ); # take the rest into our humble hash
-    if ( $lasttag eq 'comment' ) {
-        # get some data on a comment
-        $meta{$temp{id}} = {
-            id => $temp{id},
-            posterid => $temp{posterid}+0,
-            state => $temp{state} || 'A',
-        };
-    } elsif ( $lasttag eq 'usermap' && !$opts->{identity_map}->{$temp{id}} ) {
-        push @userids, $temp{id};
-        $opts->{identity_map}->{$temp{id}} = remap_username_friend( $opts, $temp{user} );
-    }
-};
-my $meta_closer = sub {
-    # we hit a closing tag so we're not in a tag anymore
-    $lasttag = '';
-};
-my $meta_content = sub {
-    # if we're in a maxid tag, we want to save that value so we know how much further
-    # we have to go in downloading meta info
-    return unless ( $lasttag eq 'maxid' ) || ( $lasttag eq 'nextid' );
-    $server_max_id = $_[1] + 0 if ( $lasttag eq 'maxid' );
-    $server_next_id = $_[1] + 0 if ( $lasttag eq 'nextid' );
-};
-
-# hit up the server for metadata
-while ( defined $server_next_id && $server_next_id =~ /^\d+$/ ) {
-    DW::Worker::ContentImporter->ratelimit_request( $opts );
-    my $content = do_authed_fetch( $opts, 'comment_meta', $server_next_id, $COMMENTS_FETCH_META, $session );
-    #die "Some sort of error fetching metadata from server" unless $content;
-
-    $server_next_id = undef;
-
-    # now we want to XML parse this
-    my $parser = new XML::Parser( Handlers => { Start => $meta_handler, Char => $meta_content, End => $meta_closer } );
-    $parser->parse( $content );
-}
-
-# setup our handlers for body XML info
-my $lastid = 0;
-my $curid = 0;
-my @tags;
-my $body_handler = sub {
-    # this sub actually processes incoming body information
-    $lasttag = $_[1];
-    push @tags, $lasttag;
-    shift; shift;      # remove the Expat object and tag name
-    my %temp = ( @_ ); # take the rest into our humble hash
-    if ( $lasttag eq 'comment' ) {
-        # get some data on a comment
-        $curid = $temp{id};
-        $meta{$curid}{parentid} = $temp{parentid}+0;
-        $meta{$curid}{jitemid} = $temp{jitemid}+0;
-        # line below commented out because we shouldn't be trying to be clever like this ;p
-        # $lastid = $curid if $curid > $lastid;
-    }
-};
-my $body_closer = sub {
-    # we hit a closing tag so we're not in a tag anymore
-    my $tag = pop @tags;
-    $lasttag = $tags[0];
-};
-my $body_content = sub {
-    # this grabs data inside of comments: body, subject, date
-    return unless $curid;
-    return unless $lasttag =~ /(?:body|subject|date)/;
-    $meta{$curid}{$lasttag} .= $_[1];
-    # have to .= it, because the parser will split on punctuation such as an apostrophe
-    # that may or may not be in the data stream, and we won't know until we've already
-    # gotten some data
-};
-
-# at this point we have a fully regenerated metadata cache and we want to grab a block of comments
-while ( 1 ) {
-    DW::Worker::ContentImporter->ratelimit_request( $opts );
-    my $content = do_authed_fetch( $opts, 'comment_body', $lastid+1, $COMMENTS_FETCH_BODY, $session );
-
-    # now we want to XML parse this
-    my $parser = new XML::Parser( Handlers => { Start => $body_handler, Char => $body_content, End => $body_closer } );
-    $parser->parse( $content );
-
-    # now at this point what we have to decide whether we should loop again for more metadata
-    $lastid += $COMMENTS_FETCH_BODY;
-    last unless $lastid < $server_max_id;
-}
-
-foreach my $comment ( values %meta ) {
-    $comment->{posterid} = $opts->{identity_map}->{$comment->{posterid}};
-    $comment->{jitemid} = $opts->{jitemid_map}->{$comment->{jitemid}};
-
-    $comment->{unresolved} = 1 if ($comment->{parentid});
-
-    my $body = remap_lj_user($opts,$comment->{body});
-    $body =~ s/<.+?-embed-.+?>//g;
-    $body =~ s/<.+?-template-.+?>//g;
-    $comment->{body} = $body;
-
-    $comment->{orig_id} = $comment->{id};
-
-    if ($comment->{parentid} && $comment->{state} ne 'D') {
-        $meta{$comment->{parentid}}->{has_children} = 1;
-    }
-}
-
-my @to_import = sort { ( $a->{id}+0 ) <=> ( $b->{id}+0 ) } values %meta;
-my $had_unresolved = 1;
-# This loop should never need to run through more then once
-# but, it will *if* for some reason a comment comes before its parent
-# which *should* never happen, but I'm handling it anyway, just in case.
-while ($had_unresolved) {
-    $had_unresolved = 0;
-    my $ct = 0;
-    my $ct_unresolved = 0;
-    foreach my $comment (@to_import) {
-        next if $comment->{done}; # Skip this comment if it was already imported this round
-        next if $opts->{jtalkid_map}->{$comment->{orig_id}}; # Or on a previous import round
-        next if ( $comment->{state} eq 'D' && !$comment->{has_children} ); # Or if the comment is deleted, and child-less
-        $ct++;
-        if ( $comment->{unresolved} ) {
-            # lets see if this is resolvable at the moment
-            # A resolvable comment is a comment that's parent is already in the DW database
-            # and an unresolved comment is a comment that has a parent that is currently not in the database.
-            if ( $opts->{jtalkid_map}->{$comment->{parentid}} ) {
-                $comment->{parentid} = $opts->{jtalkid_map}->{$comment->{parentid}};
-                $comment->{unresolved} = 0;
-            }
-        }
-        if ( $comment->{unresolved} ) {
-            $ct_unresolved++;
-            $had_unresolved = 1;
+        # if we weren't able to map to a jitemid (last entry import was a while ago?)
+        # or some other problem, log it and skip this comment
+        unless ( $jitemid_map->{$comment->{jitemid}} ) {
+            $comment->{skip} = 1;
+            $log->( 'NO MAPPED ENTRY: remote values: jitemid %d, posterid %d, jtalkid %d.',
+                    $comment->{jitemid}, $comment->{posterid}, $comment->{id} );
             next;
         }
-        my $talkid = DW::Worker::ContentImporter->insert_comment( $u, $opts, $comment );
-        $opts->{jtalkid_map}->{$comment->{id}} = $talkid;
-        $comment->{id} = $talkid;
-        $comment->{done} = 1;
+
+        # basic mappings
+        $comment->{posterid} = $identity_map->{$comment->{posterid}};
+        $comment->{jitemid} = $jitemid_map->{$comment->{jitemid}};
+        $comment->{orig_id} = $comment->{id};
+    
+        # unresolved comments means we haven't got the parent in the database
+        # yet so we can't post this one
+        $comment->{unresolved} = 1
+            if $comment->{parentid};
+    
+        # the reverse of unresolved, tell the parent it has visible children
+        $meta{$comment->{parentid}}->{has_children} = 1
+            if $comment->{parentid} && $comment->{state} ne 'D';
+
+        # remap content (user links) then remove embeds/templates
+        my $body = $class->remap_lj_user( $data, $comment->{body} );
+        $body =~ s/<.+?-embed-.+?>/[Embedded content removed during import.]/g;
+        $body =~ s/<.+?-template-.+?>/[Templated content removed during import.]/g;
+        $comment->{body} = $body;
     }
-    # Sanity check. This *really* should never happen.
-    # This is here to prevent an endless loop, just in case.
-    # The only way I can see this firing is if a comment is just
-    # totally missing.
-    if ( $ct == $ct_unresolved && $had_unresolved ) {
-        # FIXME: Error
-        $had_unresolved = 0; # Set this to 0 so the loop falls through
+    
+    # variable setup for the database work
+    my @to_import = sort { ( $a->{id}+0 ) <=> ( $b->{id}+0 ) } values %meta;
+    my $had_unresolved = 1;
+
+    # This loop should never need to run through more than once,
+    # but it will *if* for some reason a comment comes before its parent,
+    # which *should* never happen, but I'm handling it anyway, just in case.
+    while ( $had_unresolved ) {
+
+        # variables, and reset
+        my ( $ct, $ct_unresolved ) = ( 0, 0 );
+        $had_unresolved = 0;
+
+        # now doing imports!
+        foreach my $comment ( @to_import ) {
+            next if $comment->{skip};
+
+            $log->( "Attempting to import remote id %d, parentid %d, state %s.",
+                    $comment->{orig_id}, $comment->{parentid}, $comment->{state} );
+
+            # rules that can make us skip a comment
+            next if $comment->{done}; # Skip this comment if it was already imported this round
+            next if $jtalkid_map->{$comment->{orig_id}}; # Or on a previous import round
+            next if $comment->{state} eq 'D' && !$comment->{has_children}; # Or if the comment is deleted, and child-less
+
+            # now we know this one is going in the database
+            $ct++;
+
+            # try to resolve
+            if ( $comment->{unresolved} ) {
+                # let's see if this is resolvable at the moment.  A resolvable
+                # comment is one whose parent is already in the DW database; an
+                # unresolved comment has a parent that isn't in the database yet.
+                if ( $jtalkid_map->{$comment->{parentid}} ) {
+                    $comment->{parentid} = $jtalkid_map->{$comment->{parentid}};
+                    $comment->{unresolved} = 0;
+
+                    $log->( 'Resolved unresolved comment to local parentid %d.',
+                            $comment->{parentid} );
+
+                } else {
+                    # guess we couldn't resolve it :( next pass!
+                    $ct_unresolved++;
+                    $had_unresolved = 1;
+
+                    $log->( 'Failed to resolve comment.' );
+
+                    next;
+                }
+            }
+
+            # if we get here we're good to insert into the database
+            my $err = "";
+            my $talkid = DW::Worker::ContentImporter::Local::Comments->insert_comment( $u, $comment, \$err );
+            if ( $talkid ) {
+                $log->( 'Successfully imported source %d to new jtalkid %d.', $comment->{id}, $talkid );
+            } else {
+                $log->( 'Failed to import comment %d: %s.', $comment->{id}, $err );
+                return $temp_fail->( 'Failure importing comment: %s.', $err );
+            }
+
+            # store this information
+            $jtalkid_map->{$comment->{id}} = $talkid;
+            $comment->{id} = $talkid;
+            $comment->{done} = 1;
+        }
+
+        # Sanity check. This *really* should never happen.
+        # This is here to prevent an endless loop, just in case.
+        # The only way I can see this firing is if a comment is just
+        # totally missing.
+        if ( $ct == $ct_unresolved && $had_unresolved ) {
+            $log->( 'The unthinkable happened!  We hit an iceberg!!!' );
+            $log->( 'The above error: %d == %d && %d (had_unresolved).', $ct, $ct_unresolved, $had_unresolved );
+            return $fail->( 'Found unresolvable comment chain.' );
+        }
     }
-}
-$opts->{no_comments} = 1;
-
+    
     return $ok->();
 }
 
 
-1;
\ No newline at end of file
+sub do_authed_comment_fetch {
+    my ( $class, $data, $mode, $startid, $numitems ) = @_;
+
+    # if we don't have a session, then let's generate one
+    $data->{_session} ||= $class->get_lj_session( $data );
+
+    # hit up the server with the specified information and return the raw content
+    my $ua = LWP::UserAgent->new;
+    my $request = HTTP::Request->new( GET => "http://www.$data->{hostname}/export_comments.bml?get=$mode&startid=$startid&numitems=$numitems" );
+    $request->push_header( Cookie => "ljsession=$data->{_session}" );
+
+    # try to get the response
+    my $response = $ua->request( $request );
+    return if $response->is_error;
+
+    # now get the content
+    my $xml = $response->content;
+    return $xml if $xml;
+
+    # total failure...
+    return undef;
+}
+
+
+1;
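
(The meta and body passes above both lean on XML::Parser's streaming handlers: Start sees each tag and its attributes, Char sees text in possibly several chunks, End pops back out.  A self-contained sketch of that pattern against a made-up snippet shaped like the comment_meta output; it only mirrors what the handlers in the worker expect, not the exporter's actual schema.)

  use strict;
  use warnings;
  use XML::Parser;

  # made-up metadata snippet: a maxid plus two comments
  my $xml = join "\n",
      '<livejournal>',
      '  <maxid>3</maxid>',
      '  <comments>',
      '    <comment id="1" posterid="42" />',
      '    <comment id="2" posterid="0" state="S" />',
      '  </comments>',
      '</livejournal>';

  my %meta;
  my $lasttag = '';
  my $server_max_id = '';

  my $parser = XML::Parser->new(
      Handlers => {
          Start => sub {
              my ( undef, $tag, %attr ) = @_;
              $lasttag = $tag;
              $meta{ $attr{id} } = { posterid => $attr{posterid} + 0,
                                     state    => $attr{state} || 'A' }
                  if $tag eq 'comment';
          },
          Char  => sub {
              # character data can arrive in chunks, so append rather than assign
              $server_max_id .= $_[1] if $lasttag eq 'maxid';
          },
          End   => sub { $lasttag = ''; },
      },
  );
  $parser->parse( $xml );

  printf "max id %d, %d comments seen\n", $server_max_id, scalar keys %meta;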
diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm	Tue Mar 10 15:34:40 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm	Wed Mar 11 05:36:29 2009 +0000
@@ -99,8 +99,23 @@ sub try_work {
         $log->( '    retrieved %d items and %d left to sync', $hash->{count}, $hash->{total} );
         last if $hash->{count} == $hash->{total};
     }
-    $log->( 'Syncitems finished.' );
+    $log->( 'Syncitems finished with %d items pre-prune.', scalar( keys %sync ) );
 
+    # this is an optimization.  since we never do an edit event (only post!) we will
+    # never get changes anyway.  so let's remove from the list of things to sync any
+    # post that we already know about.  (not that we really care, but it's much nicer
+    # on people we're pulling from.)
+    foreach my $url ( keys %$entry_map ) {
+        unless ( $url =~ m!/(\d+)\.html$! ) {
+            $log->( 'URL %s not of expected format in prune.', $url );
+            next;
+        }
+
+        delete $sync{$1 >> 8};
+    }
+    $log->( 'Syncitems now has %d items post-prune.', scalar( keys %sync ) );
+
+    # simple helper sub
     my $realtime = sub {
         my $id = shift;
         return $sync{$id}->[1] if @{$sync{$id} || []};
@@ -212,6 +227,15 @@ sub try_work {
         $log->( '    counted %d entries, lastgrab is now %s.', $count, $lastgrab );
     }
 
+    # mark the comments mode as ready to schedule
+    my $dbh = LJ::get_db_writer();
+    $dbh->do(
+        q{UPDATE import_items SET status = 'ready'
+          WHERE userid = ? AND item IN ('lj_comments')
+          AND import_data_id = ? AND status = 'init'},
+        undef, $u->id, $opts->{import_data_id}        
+    );
+
     return $ok->();
 }
 
diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm	Tue Mar 10 15:34:40 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm	Wed Mar 11 05:36:29 2009 +0000
@@ -56,13 +56,14 @@ sub try_work {
 
     my $map = DW::Worker::ContentImporter::Local::TrustGroups->merge_trust_groups( $u, $r->{friendgroups} );
 
-    # mark lj_friends item as able to be scheduled now, and save the map
+    # store the merged map
     $dbh->do(
         q{UPDATE import_data SET groupmap = ?
           WHERE userid = ? AND import_data_id = ?},
         undef, nfreeze( $map ), $u->id, $opts->{import_data_id}
     );
 
+    # mark lj_friends item as able to be scheduled now
 # FIXME: what do we do on error case? well, hopefully that will be rare...
     $dbh->do(
         q{UPDATE import_items SET status = 'ready'
diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
--- a/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm	Tue Mar 10 15:34:40 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm	Wed Mar 11 05:36:29 2009 +0000
@@ -26,7 +26,100 @@ DW::Worker::ContentImporter::Local::Comm
 
 These functions are part of the Saving API for comments.
 
+=head2 C<< $class->get_comment_map( $u ) >>
+
+Returns a hashref mapping import_source keys to jtalkids
+
 =cut
 
+sub get_comment_map {
+    my ( $class, $u ) = @_;
+
+    my $p = LJ::get_prop( "talk", "import_source" );
+    return {} unless $p;
+
+    my $dbr = LJ::get_cluster_reader( $u );
+    my %map;
+    my $sth = $dbr->prepare( "SELECT jtalkid, value FROM talkprop2 WHERE journalid = ? AND tpropid = ?" );
+
+    $sth->execute( $u->id, $p->{id} );
+
+    while ( my ( $jtalkid, $value ) = $sth->fetchrow_array ) {
+        $map{$value} = $jtalkid;
+    }
+
+    return \%map;
+}
+
+=head2 C<< $class->insert_comment( $u, $comment, $errref ) >>
+
+$comment is a hashref representation of a single comment, with the following format:
+
+  {
+    subject => "Comment",
+    body => 'I DID STUFF!!!!!',
+    posterid => $local_userid,
+
+    jitemid => $local_jitemid,
+
+    parentid => $local_parent,
+
+    state => 'A',
+  }
+
+$errref is a scalar reference to put any error text in.
+
+=cut
+
+sub insert_comment {
+    my ( $class, $u, $cmt, $errref ) = @_;
+    $errref ||= do { my $e; \$e };   # make sure we always have a scalar ref to put error text in
+
+    # load the data we need to make this comment
+    use Data::Dumper;
+    warn Dumper( $cmt ) unless $cmt->{jitemid};
+
+    my $jitem = LJ::Entry->new( $u, jitemid => $cmt->{jitemid} );
+    my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $cmt->{id} << 8 );
+    my $user = LJ::load_userid( $cmt->{posterid} )
+        if $cmt->{posterid};
+
+    # fix the XML timestamp to a useful timestamp
+    my $date = $cmt->{date};
+    $date =~ s/T/ /;
+    $date =~ s/Z//;
+
+    # sometimes the date is empty
+    # FIXME: why?  Dre had this, when can the date be empty?
+    $date ||= LJ::mysql_time();
+
+    # build the data structures we use.  we are sort of faking it here.
+    my $comment = {
+        subject => $cmt->{subject},
+        body => $cmt->{body},
+
+        state => $cmt->{state},
+        u => $user,
+
+        props => {
+            import_source => $source,
+        },
+
+        no_urls => 1,
+        no_esn => 1,
+    };
+
+    my $item = {
+        itemid => $cmt->{jitemid},
+    };
+
+    my $parent = {
+        talkid => $cmt->{parentid},
+    };
+
+    # now try to import it; this returns the new jtalkid, and any error text
+    # ends up in $$errref for the caller
+    return LJ::Talk::Post::enter_imported_comment( $u, $parent, $item, $comment, $date, $errref );
+}
+
 
 1;
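
(For reference, this is roughly how the comments worker above hands a comment to insert_comment, using the hashref layout from the POD plus the id/date keys the code reads.  Everything here is a placeholder: the user, the ids, and the assumption that the entry and poster already exist locally.  It assumes the usual LJ::load_user helper and only runs inside the Dreamwidth code environment.)

  use strict;
  use warnings;
  use DW::Worker::ContentImporter::Local::Comments;

  my $u = LJ::load_user( 'someuser' )
      or die "no such user";

  my $err = '';
  my $jtalkid = DW::Worker::ContentImporter::Local::Comments->insert_comment(
      $u,
      {
          subject  => 'Comment',
          body     => 'I DID STUFF!!!!!',
          posterid => 1234,                    # local userid of the commenter
          jitemid  => 56,                      # local entry this comment belongs to
          parentid => 0,                       # local jtalkid of the parent; 0 for top-level
          state    => 'A',
          id       => 789,                     # remote comment id, used for the import_source ?thread= value
          date     => '2009-03-11T05:36:29Z',  # export-format timestamp
      },
      \$err,
  );

  die "import failed: $err" unless $jtalkid;
  print "imported as jtalkid $jtalkid\n";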
diff -r 89cf10bae98e -r 6115e6b86caf htdocs/misc/import.bml
--- a/htdocs/misc/import.bml	Tue Mar 10 15:34:40 2009 +0000
+++ b/htdocs/misc/import.bml	Wed Mar 11 05:36:29 2009 +0000
@@ -59,6 +59,10 @@ body<=
         );
         return "Database error." if $dbh->err;
 
+        # if comments are on, turn entries on
+        $POST{lj_entries} = 1
+            if $POST{lj_comments};
+
         # okay, this is kinda hacky but turn on the right things so we can do
         # a proper entry import...
         if ( $POST{lj_entries} ) {
@@ -74,6 +78,7 @@ body<=
             ['lj_friendgroups', 'ready'],
 #            ['lj_friends',      'init' ],
             ['lj_entries',      'init' ],
+            ['lj_comments',     'init' ],
         );
 
         # schedule userpic, bio, and tag imports
@@ -163,6 +168,7 @@ EOF
 
     # checkbox easier this way
     my %opts = (
+        lj_comments     => [ 0, 'Import all comments.  Implies: entries.' ],
         lj_entries      => [ 0, 'Import all journal entries.  Implies: tags, friendgroups.' ],
         lj_tags         => [ 1, 'Import list of tags.' ],
         lj_userpics     => [ 1, 'Import icons (aka userpics).' ],
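
(The checkbox defaults above encode an implication chain: comments pull in entries, and entries pull in tags and friend groups, which import.bml resolves with straight-line checks like the new "$POST{lj_entries} = 1 if $POST{lj_comments};".  A tiny sketch of the same cascade written as data; the expand_implied helper is invented for illustration and is not part of the patch.)

  use strict;
  use warnings;

  my %implies = (
      lj_comments => [ 'lj_entries' ],
      lj_entries  => [ 'lj_tags', 'lj_friendgroups' ],
  );

  # Turn on everything a requested item implies, transitively.
  sub expand_implied {
      my ( %requested ) = @_;
      my @queue = grep { $requested{$_} } keys %requested;
      while ( my $item = shift @queue ) {
          foreach my $dep ( @{ $implies{$item} || [] } ) {
              next if $requested{$dep};
              $requested{$dep} = 1;
              push @queue, $dep;
          }
      }
      return %requested;
  }

  my %post  = ( lj_comments => 1 );
  my %final = expand_implied( %post );
  print join( ', ', sort keys %final ), "\n";   # lj_comments, lj_entries, lj_friendgroups, lj_tags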
--------------------------------------------------------------------------------
