[dw-free] Allow importing of your journal from another LiveJournal-based site.
[commit: http://hg.dwscoalition.org/dw-free/rev/6115e6b86caf]
http://bugs.dwscoalition.org/show_bug.cgi?id=114
Work on the comment import system. About 98% functional.
Patch by mark.
Files modified:
http://bugs.dwscoalition.org/show_bug.cgi?id=114
Work on the comment import system. About 98% functional.
Patch by
![[staff profile]](https://www.dreamwidth.org/img/silk/identity/user_staff.png)
Files modified:
- cgi-bin/DW/Worker/ContentImporter.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm
- cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
- htdocs/misc/import.bml
-------------------------------------------------------------------------------- diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter.pm --- a/cgi-bin/DW/Worker/ContentImporter.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter.pm Wed Mar 11 05:36:29 2009 +0000 @@ -83,126 +83,7 @@ sub merge_watch { } -=head2 C<< $class->post_event( $user, $hashref, $comment ) >> - -$event is a hashref representation of a single comment, with the following format: - - { - subject => "Comment", - body => 'I DID STUFF!!!!!', - posterid => $local_userid, - - jitemid => $local_jitemid, - - parentid => $local_parent, - - state => 'A', - } - -=cut -sub insert_comment { - my ( $class, $u, $opts, $_comment ) = @_; - - my $errref; - - my $jitem = LJ::Entry->new( $u, jitemid=>$_comment->{jitemid} ); - my $user = undef; - my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $_comment->{id} << 8 ); - $user = LJ::load_userid( $_comment->{posterid} ) if $_comment->{posterid}; - - my $date = $_comment->{date}; - $date =~ s/T/ /; - $date =~ s/Z//; - - my $comment = { - subject => $_comment->{subject}, - body => $_comment->{body}, - - state => $_comment->{state}, - u => $user, - - props => { - import_source => $source, - }, - - no_urls => 1, - no_esn => 1, - }; - my $item = { - itemid => $_comment->{jitemid}, - }; - my $parent = { - talkid => $_comment->{parentid}, - }; - - unless ($date) { - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = gmtime(); - $date = sprintf( "%4i-%2i-%2i %2i:%2i:%2i", 1900+$year, $mday, $mon, $hour, $min, $sec ); - } - - my $jtalkid = LJ::Talk::Post::enter_imported_comment( $u, $parent, $item, $comment, $date, \$errref ); - return undef unless $jtalkid; - return $jtalkid; -} - -=head2 C<< $class->get_comment_map( $user, $hashref ) - -Returns a hashref mapping import_source keys to jtalkids - -=cut -sub get_comment_map { - my ( $class, $u, $opts ) = @_; - return $opts->{talk_map} if $opts->{talk_map}; - - 
my $p = LJ::get_prop( "talk", "import_source" ); - return {} unless $p; - - my $dbr = LJ::get_cluster_reader( $u ); - my %map; - my $sth = $dbr->prepare( "SELECT jtalkid, value FROM talkprop2 WHERE journalid = ? AND tpropid = ?" ); - - $sth->execute( $u->id, $p->{id} ); - - while ( my ($jitemid,$value) = $sth->fetchrow_array ) { - $map{$value} = $jitemid; - } - - return \%map; -} - =head1 Helper Functions - -=head2 C<< $class->ratelimit_request( $hashref ) >> - -Imposes a ratelimit on the number of times this function can be called - -$hashref *must* be the same hash between calls, and must have a _rl_requests and _rl_seconds member. - -=cut -sub ratelimit_request { - my ( $class, $hashref ) = @_; - - # the next two lines load in the ratio - for example, a maximum of 4 requests in 1 second - my $num_requests = $hashref->{'_rl_requests'}; - my $num_seconds = $hashref->{'_rl_seconds'}; - - # $state is an arrayref containing timestamps - my $state = $hashref->{'_rl_delay_state'}; - if ( !defined( $state ) ) { - $state = []; - $hashref->{'_rl_delay_state'} = $state; - } - - my $now = time(); - push( @{$state}, $now ); - return if @{$state} < $num_requests; # we haven't done enough requests to justify a wait yet - - my $oldest = shift( @{$state} ); - if ( ( $now - $oldest ) < $num_seconds ) { - sleep( $num_seconds - ( $now - $oldest ) ); - } - return; -} =head2 C<< $class->import_data( $userid, $import_data_id ) >> diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm Wed Mar 11 05:36:29 2009 +0000 @@ -203,19 +203,6 @@ sub retry_delay { return ( 10, 30, 60, 300, 600 )[$fails]; } -sub do_authed_fetch { - my ( $opts, $mode, $startid, $numitems, $sess ) = @_; - - # hit up the server with the specified information and return the raw content - my $ua = LWP::UserAgent->new; - my $request = 
HTTP::Request->new( GET => "http://$opts->{server}/export_comments.bml?get=$mode&startid=$startid&numitems=$numitems" ); - $request->push_header( Cookie => "ljsession=$sess" ); - my $response = $ua->request( $request ); - return if $response->is_error; - my $xml = $response->content; - return $xml if $xml; -} - ############################################################################## # MASON DIXON LINE \o/ # South of here, these functions have been updated with the changes Mark is @@ -334,12 +321,14 @@ sub get_remapped_userids { unless defined $oid; } - unless ( defined $fid ) { - warn "[$$] Remapping feed userid of $data->{hostname}:$user\n"; - $fid = $class->remap_username_feed( $data, $user ); - warn " FEED USERID IS STILL UNDEFINED\n" - unless defined $fid; - } +# FIXME: this is temporarily disabled while we hash out exactly how we want +# this functionality to work. +# unless ( defined $fid ) { +# warn "[$$] Remapping feed userid of $data->{hostname}:$user\n"; +# $fid = $class->remap_username_feed( $data, $user ); +# warn " FEED USERID IS STILL UNDEFINED\n" +# unless defined $fid; +# } $dbh->do( 'REPLACE INTO import_usermap (hostname, username, identity_userid, feed_userid) VALUES (?, ?, ?, ?)', undef, $data->{hostname}, $user, $oid, $fid ); @@ -423,18 +412,13 @@ sub remap_lj_user { } sub get_lj_session { - my $imp = $_[0]; + my ( $class, $imp ) = @_; - my $r = call_xmlrpc( $imp, 'sessiongenerate', { expiration => 'short' } ); + my $r = $class->call_xmlrpc( $imp, 'sessiongenerate', { expiration => 'short' } ); return undef unless $r && ! $r->{fault}; return $r->{ljsession}; -} - -sub d { - warn shift(@_) . 
"\n" - if $LJ::IS_DEV_SERVER; } sub xmlrpc_call_helper { diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm Wed Mar 11 05:36:29 2009 +0000 @@ -20,6 +20,7 @@ use base 'DW::Worker::ContentImporter::L use base 'DW::Worker::ContentImporter::LiveJournal'; use Carp qw/ croak confess /; +use Time::HiRes qw/ tv_interval gettimeofday /; use DW::Worker::ContentImporter::Local::Comments; # these come from LJ @@ -40,215 +41,329 @@ sub try_work { my ( $class, $job ) = @_; my $opts = $job->arg; my $data = $class->import_data( $opts->{userid}, $opts->{import_data_id} ); + my $begin_time = [ gettimeofday() ]; # failure wrappers for convenience my $fail = sub { return $class->fail( $data, 'lj_comments', $job, @_ ); }; my $ok = sub { return $class->ok( $data, 'lj_comments', $job ); }; my $temp_fail = sub { return $class->temp_fail( $data, 'lj_comments', $job, @_ ); }; + my $status = sub { return $class->status( $data, 'lj_comments', { @_ } ); }; + + # logging sub + my ( $logfile, $last_log_time ); + my $log = sub { + $last_log_time ||= [ gettimeofday() ]; + + unless ( $logfile ) { + mkdir "$LJ::HOME/logs/imports"; + mkdir "$LJ::HOME/logs/imports/$opts->{userid}"; + open $logfile, ">>$LJ::HOME/logs/imports/$opts->{userid}/$opts->{import_data_id}.lj_comments.$$" + or return $temp_fail->( 'Internal server error creating log.' ); + print $logfile "[0.00s 0.00s] Log started at " . LJ::mysql_time(gmtime()) . ".\n"; + } + + my $fmt = "[%0.4fs %0.1fs] " . shift() . 
"\n"; + my $msg = sprintf( $fmt, tv_interval( $last_log_time ), tv_interval( $begin_time), @_ ); + + print $logfile $msg; + $job->debug( $msg ); + + $last_log_time = [ gettimeofday() ]; + }; # setup my $u = LJ::load_userid( $data->{userid} ) or return $fail->( 'Unable to load target with id %d.', $data->{userid} ); + $log->( 'Import begun for %s(%d).', $u->user, $u->userid ); - # temporary failure, this code hasn't been ported yet - return $fail->( 'oops, not ready yet' ); -} + # this will take a entry_map (old URL -> new jitemid) and convert it into a jitemid map (old jitemid -> new jitemid) + my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {}; + $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) ); -1; -__END__ + # now backfill into jitemid_map + my $jitemid_map = {}; + foreach my $url ( keys %$entry_map ) { + my $jitemid = $1 >> 8 + if $url =~ m!/(\d+)\.html$!; + $jitemid_map->{$jitemid} = $entry_map->{$url}; + } + + # this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid) + my $talk_map = DW::Worker::ContentImporter::Local::Comments->get_comment_map( $u ) || {}; + $log->( 'Loaded comment map with %d entries.', scalar( keys %$talk_map ) ); -### WORK GOES HERE -$opts->{identity_map} ||= {}; + # now reverse it as above + my $jtalkid_map = {}; + foreach my $url ( keys %$talk_map ) { + my $jtalkid = $1 >> 8 + if $url =~ m!thread=(\d+)$!; + $jtalkid_map->{$jtalkid} = $talk_map->{$url}; + } -# this will take a entry_map (old URL -> new jitemid) and convert it into a jitemid map (old jitemid -> new jitemid) -# TODO: Make sure you are dealing with the correct site. 
-unless ( $opts->{jitemid_map} ) { - $opts->{entry_map} ||= DW::Worker::ContentImporter->get_entry_map($u,$opts); - $opts->{jitemid_map} = {}; - foreach my $url ( keys %{$opts->{entry_map}} ) { - next unless $url =~ m/$opts->{user_path}/; - my ($ditemid) = $url =~ m/\/([0-9]+)\.html?$/; - my $jitemid = $ditemid >> 8; - $opts->{jitemid_map}->{$jitemid} = $opts->{entry_map}->{$url}; + # parameters for below + my ( %meta, @userids, $identity_map ); + my ( $maxid, $server_max_id, $server_next_id, $lasttag ) = ( 0, 0, 1, '' ); + + # setup our parsing function + my $meta_handler = sub { + # this sub actually processes incoming meta information + $lasttag = $_[1]; + shift; shift; # remove the Expat object and tag name + my %temp = ( @_ ); # take the rest into our humble hash + + # if we were last getting a comment, start storing the info + if ( $lasttag eq 'comment' ) { + # get some data on a comment + $meta{$temp{id}} = { + id => $temp{id}, + posterid => $temp{posterid}+0, + state => $temp{state} || 'A', + }; + + } elsif ( $lasttag eq 'usermap' && ! 
exists $identity_map->{$temp{id}} ) { + push @userids, $temp{id}; + + my ( $local_oid, $local_fid ) = $class->get_remapped_userids( $data, $temp{user} ); + $identity_map->{$temp{id}} = $local_oid; + + $log->( 'Mapped remote %s(%d) to local userid %d.', $temp{user}, $temp{id}, $local_oid ); + } + }; + my $meta_closer = sub { + # we hit a closing tag so we're not in a tag anymore + $lasttag = ''; + }; + my $meta_content = sub { + # if we're in a maxid tag, we want to save that value so we know how much further + # we have to go in downloading meta info + return undef + unless $lasttag eq 'maxid' || + $lasttag eq 'nextid'; + + # save these values for later + $server_max_id = $_[1] + 0 if $lasttag eq 'maxid'; + $server_next_id = $_[1] + 0 if $lasttag eq 'nextid'; + }; + + # hit up the server for metadata + while ( defined $server_next_id && $server_next_id =~ /^\d+$/ ) { + $log->( 'Fetching metadata; max_id = %d, next_id = %d.', $server_max_id || 0, $server_next_id || 0 ); + + my $content = $class->do_authed_comment_fetch( + $data, 'comment_meta', $server_next_id, $COMMENTS_FETCH_META + ); + return $temp_fail->( 'Error fetching comment metadata from server.' ) + unless $content; + + $server_next_id = undef; + + # now we want to XML parse this + my $parser = new XML::Parser( + Handlers => { + Start => $meta_handler, + Char => $meta_content, + End => $meta_closer + } + ); + $parser->parse( $content ); } -} + $log->( 'Finished fetching metadata.' ); -# this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid) -# TODO: Make sure you are dealing with the correct site. 
-unless ( $opts->{jtalkid_map} ) { - $opts->{talk_map} ||= DW::Worker::ContentImporter->get_comment_map( $u, $opts ); - $opts->{jtalkid_map} = {}; - foreach my $url ( keys %{$opts->{talk_map}} ) { - next unless $url =~ m/$opts->{user_path}/; - my ( $dtalkid ) = $url =~ m/\?thread=([0-9]+)$/; - my $jtalkid = $dtalkid >> 8; - $opts->{jtalkid_map}->{$jtalkid} = $opts->{talk_map}->{$url}; + # body handling section now + my ( $lastid, $curid, @tags ) = ( 0, 0 ); + + # setup our handlers for body XML info + my $body_handler = sub { + # this sub actually processes incoming body information + $lasttag = $_[1]; + push @tags, $lasttag; + shift; shift; # remove the Expat object and tag name + my %temp = ( @_ ); # take the rest into our humble hash + if ( $lasttag eq 'comment' ) { + # get some data on a comment + $curid = $temp{id}; + $meta{$curid}{parentid} = $temp{parentid}+0; + $meta{$curid}{jitemid} = $temp{jitemid}+0; + } + }; + my $body_closer = sub { + # we hit a closing tag so we're not in a tag anymore + my $tag = pop @tags; + $lasttag = $tags[0]; + }; + my $body_content = sub { + # this grabs data inside of comments: body, subject, date + return unless $curid; + return unless $lasttag =~ /(?:body|subject|date)/; + $meta{$curid}{$lasttag} .= $_[1]; + # have to .= it, because the parser will split on punctuation such as an apostrophe + # that may or may not be in the data stream, and we won't know until we've already + # gotten some data + }; + + # start looping to fetch all of the comment bodies + while ( $lastid < $server_max_id ) { + $log->( 'Fetching bodydata; last_id = %d, max_id = %d.', $lastid || 0, $server_max_id || 0 ); + + my $content = $class->do_authed_comment_fetch( + $data, 'comment_body', $lastid+1, $COMMENTS_FETCH_BODY + ); + return $temp_fail->( 'Error fetching comment body data from server.' 
) + unless $content; + + # now we want to XML parse this + my $parser = new XML::Parser( + Handlers => { + Start => $body_handler, + Char => $body_content, + End => $body_closer + } + ); + $parser->parse( $content ); + + # the exporter should always return the maximum number of items, so loop again. of course, + # this will fail nicely as soon as some site we're importing from reduces the max items + # they return due to load. http://community.livejournal.com/changelog/5907095.html + $lastid += $COMMENTS_FETCH_BODY; } -} + + # now iterate over each comment and build the nearly final structure + foreach my $comment ( values %meta ) { -# downloaded meta data information -my %meta; -my @userids; - -# setup our parsing function -my $maxid = 0; -my $server_max_id = 0; -my $server_next_id = 1; -my $lasttag = ''; -my $meta_handler = sub { - # this sub actually processes incoming meta information - $lasttag = $_[1]; - shift; shift; # remove the Expat object and tag name - my %temp = ( @_ ); # take the rest into our humble hash - if ( $lasttag eq 'comment' ) { - # get some data on a comment - $meta{$temp{id}} = { - id => $temp{id}, - posterid => $temp{posterid}+0, - state => $temp{state} || 'A', - }; - } elsif ( $lasttag eq 'usermap' && !$opts->{identity_map}->{$temp{id}} ) { - push @userids, $temp{id}; - $opts->{identity_map}->{$temp{id}} = remap_username_friend( $opts, $temp{user} ); - } -}; -my $meta_closer = sub { - # we hit a closing tag so we're not in a tag anymore - $lasttag = ''; -}; -my $meta_content = sub { - # if we're in a maxid tag, we want to save that value so we know how much further - # we have to go in downloading meta info - return unless ( $lasttag eq 'maxid' ) || ( $lasttag eq 'nextid' ); - $server_max_id = $_[1] + 0 if ( $lasttag eq 'maxid' ); - $server_next_id = $_[1] + 0 if ( $lasttag eq 'nextid' ); -}; - -# hit up the server for metadata -while ( defined $server_next_id && $server_next_id =~ /^\d+$/ ) { - 
DW::Worker::ContentImporter->ratelimit_request( $opts ); - my $content = do_authed_fetch( $opts, 'comment_meta', $server_next_id, $COMMENTS_FETCH_META, $session ); - #die "Some sort of error fetching metadata from server" unless $content; - - $server_next_id = undef; - - # now we want to XML parse this - my $parser = new XML::Parser( Handlers => { Start => $meta_handler, Char => $meta_content, End => $meta_closer } ); - $parser->parse( $content ); -} - -# setup our handlers for body XML info -my $lastid = 0; -my $curid = 0; -my @tags; -my $body_handler = sub { - # this sub actually processes incoming body information - $lasttag = $_[1]; - push @tags, $lasttag; - shift; shift; # remove the Expat object and tag name - my %temp = ( @_ ); # take the rest into our humble hash - if ( $lasttag eq 'comment' ) { - # get some data on a comment - $curid = $temp{id}; - $meta{$curid}{parentid} = $temp{parentid}+0; - $meta{$curid}{jitemid} = $temp{jitemid}+0; - # line below commented out because we shouldn't be trying to be clever like this ;p - # $lastid = $curid if $curid > $lastid; - } -}; -my $body_closer = sub { - # we hit a closing tag so we're not in a tag anymore - my $tag = pop @tags; - $lasttag = $tags[0]; -}; -my $body_content = sub { - # this grabs data inside of comments: body, subject, date - return unless $curid; - return unless $lasttag =~ /(?:body|subject|date)/; - $meta{$curid}{$lasttag} .= $_[1]; - # have to .= it, because the parser will split on punctuation such as an apostrophe - # that may or may not be in the data stream, and we won't know until we've already - # gotten some data -}; - -# at this point we have a fully regenerated metadata cache and we want to grab a block of comments -while ( 1 ) { - DW::Worker::ContentImporter->ratelimit_request( $opts ); - my $content = do_authed_fetch( $opts, 'comment_body', $lastid+1, $COMMENTS_FETCH_BODY, $session ); - - # now we want to XML parse this - my $parser = new XML::Parser( Handlers => { Start => 
$body_handler, Char => $body_content, End => $body_closer } ); - $parser->parse( $content ); - - # now at this point what we have to decide whether we should loop again for more metadata - $lastid += $COMMENTS_FETCH_BODY; - last unless $lastid < $server_max_id; -} - -foreach my $comment ( values %meta ) { - $comment->{posterid} = $opts->{identity_map}->{$comment->{posterid}}; - $comment->{jitemid} = $opts->{jitemid_map}->{$comment->{jitemid}}; - - $comment->{unresolved} = 1 if ($comment->{parentid}); - - my $body = remap_lj_user($opts,$comment->{body}); - $body =~ s/<.+?-embed-.+?>//g; - $body =~ s/<.+?-template-.+?>//g; - $comment->{body} = $body; - - $comment->{orig_id} = $comment->{id}; - - if ($comment->{parentid} && $comment->{state} ne 'D') { - $meta{$comment->{parentid}}->{has_children} = 1; - } -} - -my @to_import = sort { ( $a->{id}+0 ) <=> ( $b->{id}+0 ) } values %meta; -my $had_unresolved = 1; -# This loop should never need to run through more then once -# but, it will *if* for some reason a comment comes before its parent -# which *should* never happen, but I'm handling it anyway, just in case. -while ($had_unresolved) { - $had_unresolved = 0; - my $ct = 0; - my $ct_unresolved = 0; - foreach my $comment (@to_import) { - next if $comment->{done}; # Skip this comment if it was already imported this round - next if $opts->{jtalkid_map}->{$comment->{orig_id}}; # Or on a previous import round - next if ( $comment->{state} eq 'D' && !$comment->{has_children} ); # Or if the comment is deleted, and child-less - $ct++; - if ( $comment->{unresolved} ) { - # lets see if this is resolvable at the moment - # A resolvable comment is a comment that's parent is already in the DW database - # and an unresolved comment is a comment that has a parent that is currently not in the database. 
- if ( $opts->{jtalkid_map}->{$comment->{parentid}} ) { - $comment->{parentid} = $opts->{jtalkid_map}->{$comment->{parentid}}; - $comment->{unresolved} = 0; - } - } - if ( $comment->{unresolved} ) { - $ct_unresolved++; - $had_unresolved = 1; + # if we weren't able to map to a jitemid (last entry import a while ago?) + # or some other problem, log it and bail + unless ( $jitemid_map->{$comment->{jitemid}} ) { + $comment->{skip} = 1; + $log->( 'NO MAPPED ENTRY: remote values: jitemid %d, posterid %d, jtalkid %d.', + $comment->{jitemid}, $comment->{posterid}, $comment->{id} ); next; } - my $talkid = DW::Worker::ContentImporter->insert_comment( $u, $opts, $comment ); - $opts->{jtalkid_map}->{$comment->{id}} = $talkid; - $comment->{id} = $talkid; - $comment->{done} = 1; + + # basic mappings + $comment->{posterid} = $identity_map->{$comment->{posterid}}; + $comment->{jitemid} = $jitemid_map->{$comment->{jitemid}}; + $comment->{orig_id} = $comment->{id}; + + # unresolved comments means we haven't got the parent in the database + # yet so we can't post this one + $comment->{unresolved} = 1 + if $comment->{parentid}; + + # the reverse of unresolved, tell the parent it has visible children + $meta{$comment->{parentid}}->{has_children} = 1 + if $comment->{parentid} && $comment->{state} ne 'D'; + + # remap content (user links) then remove embeds/templates + my $body = $class->remap_lj_user( $data, $comment->{body} ); + $body =~ s/<.+?-embed-.+?>/[Embedded content removed during import.]/g; + $body =~ s/<.+?-template-.+?>/[Templated content removed during import.]/g; + $comment->{body} = $body; } - # Sanity check. This *really* should never happen. - # This is here to prevent an endless loop, just in case. - # The only way I can see this firing is if a comment is just - # totally missing. 
- if ( $ct == $ct_unresolved && $had_unresolved ) { - # FIXME: Error - $had_unresolved = 0; # Set this to 0 so the loop falls through + + # variable setup for the database work + my @to_import = sort { ( $a->{id}+0 ) <=> ( $b->{id}+0 ) } values %meta; + my $had_unresolved = 1; + + # This loop should never need to run through more then once + # but, it will *if* for some reason a comment comes before its parent + # which *should* never happen, but I'm handling it anyway, just in case. + while ( $had_unresolved ) { + + # variables, and reset + my ( $ct, $ct_unresolved ) = ( 0, 0 ); + $had_unresolved = 0; + + # now doing imports! + foreach my $comment ( @to_import ) { + next if $comment->{skip}; + + $log->( "Attempting to import remote id %d, parentid %d, state %s.", + $comment->{orig_id}, $comment->{parentid}, $comment->{state} ); + + # rules we might skip a content with + next if $comment->{done}; # Skip this comment if it was already imported this round + next if $jtalkid_map->{$comment->{orig_id}}; # Or on a previous import round + next if $comment->{state} eq 'D' && !$comment->{has_children}; # Or if the comment is deleted, and child-less + + # now we know this one is going in the database + $ct++; + + # try to resolve + if ( $comment->{unresolved} ) { + # lets see if this is resolvable at the moment + # A resolvable comment is a comment that's parent is already in the DW database + # and an unresolved comment is a comment that has a parent that is currently not in the database. + if ( $jtalkid_map->{$comment->{parentid}} ) { + $comment->{parentid} = $jtalkid_map->{$comment->{parentid}}; + $comment->{unresolved} = 0; + + $log->( 'Resolved unresolved comment to local parentid %d.', + $comment->{parentid} ); + + } else { + # guess we couldn't resolve it :( next pass! + $ct_unresolved++; + $had_unresolved = 1; + + $log->( 'Failed to resolve comment.' 
); + + next; + } + } + + # if we get here we're good to insert into the database + my $err = ""; + my $talkid = DW::Worker::ContentImporter::Local::Comments->insert_comment( $u, $comment, \$err ); + if ( $talkid ) { + $log->( 'Successfully imported source %d to new jtalkid %d.', $comment->{id}, $talkid ); + } else { + $log->( 'Failed to import comment %d: %s.', $comment->{id}, $err ); + return $temp_fail->( 'Failure importing comment: %s.', $err ); + } + + # store this information + $jtalkid_map->{$comment->{id}} = $talkid; + $comment->{id} = $talkid; + $comment->{done} = 1; + } + + # Sanity check. This *really* should never happen. + # This is here to prevent an endless loop, just in case. + # The only way I can see this firing is if a comment is just + # totally missing. + if ( $ct == $ct_unresolved && $had_unresolved ) { + $log->( 'The unthinkable happened! We hit an iceberg!!!' ); + $log->( 'The above error: %d == %d && %d (had_unresolved).', $ct, $ct_unresolved, $had_unresolved ); + return $fail->( 'Found unresolvable comment chain.' ); + } } -} -$opts->{no_comments} = 1; - + return $ok->(); } -1; \ No newline at end of file +sub do_authed_comment_fetch { + my ( $class, $data, $mode, $startid, $numitems ) = @_; + + # if we don't have a session, then let's generate one + $data->{_session} ||= $class->get_lj_session( $data ); + + # hit up the server with the specified information and return the raw content + my $ua = LWP::UserAgent->new; + my $request = HTTP::Request->new( GET => "http://www.$data->{hostname}/export_comments.bml?get=$mode&startid=$startid&numitems=$numitems" ); + $request->push_header( Cookie => "ljsession=$data->{_session}" ); + + # try to get the response + my $response = $ua->request( $request ); + return if $response->is_error; + + # now get the content + my $xml = $response->content; + return $xml if $xml; + + # total failure... 
+ return undef; +} + + +1; diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm Wed Mar 11 05:36:29 2009 +0000 @@ -99,8 +99,23 @@ sub try_work { $log->( ' retrieved %d items and %d left to sync', $hash->{count}, $hash->{total} ); last if $hash->{count} == $hash->{total}; } - $log->( 'Syncitems finished.' ); + $log->( 'Syncitems finished with %d items pre-prune.', scalar( keys %sync ) ); + # this is an optimization. since we never do an edit event (only post!) we will + # never get changes anyway. so let's remove from the list of things to sync any + # post that we already know about. (not that we really care, but it's much nicer + # on people we're pulling from.) + foreach my $url ( keys %$entry_map ) { + unless ( $url =~ m!/(\d+)\.html$! ) { + $log->( 'URL %s not of expected format in prune.', $url ); + next; + } + + delete $sync{$1 >> 8}; + } + $log->( 'Syncitems now has %d items post-prune.', scalar( keys %sync ) ); + + # simple helper sub my $realtime = sub { my $id = shift; return $sync{$id}->[1] if @{$sync{$id} || []}; @@ -212,6 +227,15 @@ sub try_work { $log->( ' counted %d entries, lastgrab is now %s.', $count, $lastgrab ); } + # mark the comments mode as ready to schedule + my $dbh = LJ::get_db_writer(); + $dbh->do( + q{UPDATE import_items SET status = 'ready' + WHERE userid = ? AND item IN ('lj_comments') + AND import_data_id = ? 
AND status = 'init'}, + undef, $u->id, $opts->{import_data_id} + ); + return $ok->(); } diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/FriendGroups.pm Wed Mar 11 05:36:29 2009 +0000 @@ -56,13 +56,14 @@ sub try_work { my $map = DW::Worker::ContentImporter::Local::TrustGroups->merge_trust_groups( $u, $r->{friendgroups} ); - # mark lj_friends item as able to be scheduled now, and save the map + # store the merged map $dbh->do( q{UPDATE import_data SET groupmap = ? WHERE userid = ? AND import_data_id = ?}, undef, nfreeze( $map ), $u->id, $opts->{import_data_id} ); + # mark lj_friends item as able to be scheduled now, and save the map # FIXME: what do we do on error case? well, hopefully that will be rare... $dbh->do( q{UPDATE import_items SET status = 'ready' diff -r 89cf10bae98e -r 6115e6b86caf cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm --- a/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm Tue Mar 10 15:34:40 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm Wed Mar 11 05:36:29 2009 +0000 @@ -26,7 +26,100 @@ DW::Worker::ContentImporter::Local::Comm These functions are part of the Saving API for comments. +=head2 C<< $class->get_comment_map( $user, $hashref ) + +Returns a hashref mapping import_source keys to jtalkids + =cut +sub get_comment_map { + my ( $class, $u ) = @_; + + my $p = LJ::get_prop( "talk", "import_source" ); + return {} unless $p; + + my $dbr = LJ::get_cluster_reader( $u ); + my %map; + my $sth = $dbr->prepare( "SELECT jtalkid, value FROM talkprop2 WHERE journalid = ? AND tpropid = ?" 
); + + $sth->execute( $u->id, $p->{id} ); + + while ( my ( $jitemid, $value ) = $sth->fetchrow_array ) { + $map{$value} = $jitemid; + } + + return \%map; +} + +=head2 C<< $class->insert_comment( $u, $comment, $errref ) >> + +$comment is a hashref representation of a single comment, with the following format: + + { + subject => "Comment", + body => 'I DID STUFF!!!!!', + posterid => $local_userid, + + jitemid => $local_jitemid, + + parentid => $local_parent, + + state => 'A', + } + +$errref is a scalar reference to put any error text in. + +=cut + +sub insert_comment { + my ( $class, $u, $cmt, $errref ) = @_; + $errref ||= ''; + + # load the data we need to make this comment + use Data::Dumper; + warn Dumper( $cmt ) unless $cmt->{jitemid}; + + my $jitem = LJ::Entry->new( $u, jitemid => $cmt->{jitemid} ); + my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $cmt->{id} << 8 ); + my $user = LJ::load_userid( $cmt->{posterid} ) + if $cmt->{posterid}; + + # fix the XML timestamp to a useful timestamp + my $date = $cmt->{date}; + $date =~ s/T/ /; + $date =~ s/Z//; + + # sometimes the date is empty + # FIXME: why? Dre had this, when can the date be empty? + $date ||= LJ::mysql_time(); + + # build the data structures we use. we are sort of faking it here. + my $comment = { + subject => $cmt->{subject}, + body => $cmt->{body}, + + state => $cmt->{state}, + u => $user, + + props => { + import_source => $source, + }, + + no_urls => 1, + no_esn => 1, + }; + + my $item = { + itemid => $cmt->{jitemid}, + }; + + my $parent = { + talkid => $cmt->{parentid}, + }; + + # now try to import it and return this as the error code + return LJ::Talk::Post::enter_imported_comment( $u, $parent, $item, $comment, $date, \$errref ); +} + 1; diff -r 89cf10bae98e -r 6115e6b86caf htdocs/misc/import.bml --- a/htdocs/misc/import.bml Tue Mar 10 15:34:40 2009 +0000 +++ b/htdocs/misc/import.bml Wed Mar 11 05:36:29 2009 +0000 @@ -59,6 +59,10 @@ body<= ); return "Database error." 
if $dbh->err; + # if comments are on, turn entries on + $POST{lj_entries} = 1 + if $POST{lj_comments}; + # okay, this is kinda hacky but turn on the right things so we can do # a proper entry import... if ( $POST{lj_entries} ) { @@ -74,6 +78,7 @@ body<= ['lj_friendgroups', 'ready'], # ['lj_friends', 'init' ], ['lj_entries', 'init' ], + ['lj_comments', 'init' ], ); # schedule userpic, bio, and tag imports @@ -163,6 +168,7 @@ EOF # checkbox easier this way my %opts = ( + lj_comments => [ 0, 'Import all comments. Implies: entries.' ], lj_entries => [ 0, 'Import all journal entries. Implies: tags, friendgroups.' ], lj_tags => [ 1, 'Import list of tags.' ], lj_userpics => [ 1, 'Import icons (aka userpics).' ], --------------------------------------------------------------------------------