mark: A photo of Mark kneeling on top of the Taal Volcano in the Philippines. It was a long hike. (Default)
Mark Smith ([staff profile] mark) wrote in [site community profile] changelog 2009-08-11 05:17 am

[dw-free] crossposter: importing crossposted entries

[commit: http://hg.dwscoalition.org/dw-free/rev/3a86879e954b]

http://bugs.dwscoalition.org/show_bug.cgi?id=952

* [personal profile] exor674: Allow the importer to be aware of crossposted
entries, so we don't turn around and import those entries and create
duplicates.

* [staff profile] mark: Fix a bug where the manual advance mode of the entry
importer can sometimes loop. Also added error reporting/catching to some
functions.

Patch by [personal profile] exor674 and [staff profile] mark.

Files modified:
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
  • cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
  • cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm
--------------------------------------------------------------------------------
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -27,6 +27,7 @@ use LWP::UserAgent;
 use LWP::UserAgent;
 use XMLRPC::Lite;
 use Digest::MD5 qw/ md5_hex /;
+use DW::External::Account;
 
 # storage for import related stuff
 our %MAPS;
@@ -258,6 +259,69 @@ sub get_lj_session {
     return $r->{ljsession};
 }
 
+=head2 C<< $class->get_xpost_map( $user, $hashref ) >>
+
+Returns a hashref mapping the remote item id of each entry crossposted to the
+account being imported onto the local jitemid it came from.  Returns an empty
+hashref if no crosspost account matches; croaks on database failure.
+
+=cut
+
+sub get_xpost_map {
+    my ( $class, $u, $data ) = @_;
+
+    # see if the account we're importing from is configured to crosspost
+    my $acct = $class->find_matching_acct( $u, $data );
+    return {} unless $acct;
+
+    # connect to the database and ready the sql
+    my $p = LJ::get_prop( log => 'xpost' )
+        or croak 'unable to get xpost logprop';
+    my $dbcr = LJ::get_cluster_reader( $u )
+        or croak 'unable to get user cluster reader';
+    my $sth = $dbcr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" )
+        or croak 'unable to prepare statement';
+
+    # now look up the values we need
+    $sth->execute( $u->id, $p->{id} );
+    croak 'database error: ' . $sth->errstr
+        if $sth->err;
+
+    # ( remote item id => local jitemid )
+    my %map;
+    while ( my ( $jitemid, $value ) = $sth->fetchrow_array ) {
+        # decompose the xposter data (named $xposts so the $data argument is not shadowed)
+        my $xposts = DW::External::Account->xpost_string_to_hash( $value );
+        my $remote_id = $xposts->{$acct->acctid}
+            or next;
+
+        # this item was crossposted to the matching account, record it
+        $map{$remote_id} = $jitemid;
+    }
+
+    return \%map;
+}
+
+# Returns the first external account of $u whose host and username match
+# $data->{hostname} / $data->{username}, or undef if none does.  Matching is
+# case-insensitive and ignores a leading "www." on either host.
+sub find_matching_acct {
+    my ( $class, $u, $data ) = @_;
+
+    # lowercase *before* stripping, so an upper-case 'WWW.' prefix goes too
+    my $norm = sub { my $h = lc( defined $_[0] ? $_[0] : '' ); $h =~ s/^www\.//; $h };
+    my $dh = $norm->( $data->{hostname} );
+    my $du = lc( defined $data->{username} ? $data->{username} : '' );
+
+    foreach my $acct ( DW::External::Account->get_external_accounts( $u ) ) {
+        next unless $norm->( $acct->serverhost ) eq $dh;
+        next unless lc( $acct->username ) eq $du;
+        return $acct;
+    }
+
+    return undef;
+}
+
 sub xmlrpc_call_helper {
     # helper function that makes life easier on folks that call xmlrpc stuff.  this handles
     # running the actual request and checking for errors, as well as handling the cases where
@@ -266,7 +330,7 @@ sub xmlrpc_call_helper {
 
     # bail if depth is 4, obviously something is going terribly wrong
     if ( $depth >= 4 ) {
-        return 
+        return
             {
                 fault => 1,
                 faultString => 'Exceeded XMLRPC recursion limit.',
@@ -277,7 +341,7 @@ sub xmlrpc_call_helper {
     my $res;
     eval { $res = $xmlrpc->call($method, $req); };
     if ( $res && $res->fault ) {
-        return 
+        return
             {
                 fault => 1,
                 faultString => $res->fault->{faultString} || 'Unknown error.',
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -99,7 +99,12 @@ sub try_work {
     my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {};
     $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) );
 
+    # and xpost map
+    my $xpost_map = $class->get_xpost_map( $u, $data ) || {};
+    $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) );
+
     # now backfill into jitemid_map
+    my $entry_source = {};
     my $jitemid_map = {};
     $log->( 'Filtering parameters: hostname=[%s], username=[%s].', $data->{hostname}, $data->{username} );
     foreach my $url ( keys %$entry_map ) {
@@ -113,6 +118,12 @@ sub try_work {
         my $jitemid = $1 >> 8
             if $url =~ m!/(\d+)\.html$!;
         $jitemid_map->{$jitemid} = $entry_map->{$url};
+        $entry_source->{$jitemid_map->{$jitemid}} = $url;
+    }
+
+    foreach my $jitemid ( keys %$xpost_map ) {
+        $jitemid_map->{$jitemid} = $xpost_map->{$jitemid};
+        $entry_source->{$jitemid_map->{$jitemid}} = "CROSSPOSTER " . $data->{hostname} . " " . $data->{username} . " $jitemid "
     }
 
     # this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid)
@@ -316,6 +327,8 @@ sub try_work {
         $comment->{jitemid} = $jitemid_map->{$comment->{jitemid}};
         $comment->{orig_id} = $comment->{id};
 
+        $comment->{entry_source} = $entry_source->{$comment->{jitemid}};
+
         # unresolved comments means we haven't got the parent in the database
         # yet so we can't post this one
         $comment->{unresolved} = 1
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -97,6 +97,10 @@ sub try_work {
     my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {};
     $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) );
 
+    # and xpost map
+    my $xpost_map = $class->get_xpost_map( $u, $data ) || {};
+    $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) );
+
     # this is a helper sub that steps a MySQL formatted time by some offset
     # arguments: '2008-01-01 12:03:53', -1 ... returns '2008-01-01 12:03:52'
     my $step_time = sub {
@@ -156,7 +160,15 @@ sub try_work {
 
         delete $sync{$1 >> 8};
     }
-    $log->( 'Syncitems now has %d items post-prune.', scalar( keys %sync ) );
+    $log->( 'Syncitems now has %d items post-prune (first pass).', scalar( keys %sync ) );
+
+    # this is another optimization.  we know crossposted entries can be removed from
+    # the list of things we will import, as we generated them to begin with.
+    foreach my $itemid ( keys %$xpost_map ) {
+        delete $sync{$itemid};
+    }
+    $log->( 'Syncitems now has %d items post-prune (second pass).', scalar( keys %sync ) );
+
     $title->( 'post-prune' );
 
     # simple helper sub
@@ -179,9 +191,11 @@ sub try_work {
             $evt->{key} = $evt->{url};
 
             # skip this if we've already dealt with it before
-            $log->( '    %d %s %s; mapped = %d.', $evt->{itemid}, $evt->{url}, $evt->{realtime}, $entry_map->{$evt->{key}} );
+            $log->( '    %d %s %s; mapped = %d (import_source) || %d (xpost).',
+                    $evt->{itemid}, $evt->{url}, $evt->{realtime}, $entry_map->{$evt->{key}},
+                    $xpost_map->{$evt->{itemid}} );
             my $sync = delete $sync{$evt->{itemid}};
-            return if $entry_map->{$evt->{key}} || !defined $sync;
+            return if $entry_map->{$evt->{key}} || !defined $sync || $xpost_map->{$evt->{itemid}};
 
             # clean up event for LJ
             my @item_errors;
@@ -235,6 +249,11 @@ sub try_work {
         # calculate what time to get entries for
         my ( $tries, $lastgrab, $hash ) = ( 0, undef, undef );
 SYNC:   while ( $tries++ <= 10 ) {
+
+            # if we ever get in here with no entries left, we're done.  this sometimes happens
+            # when the manual advance import code hits the end of the sidewalk and runs out of
+            # things to import.
+            last unless keys %sync;
 
             # calculate the oldest entry we haven't retrieved yet, and offset that time by
             # $tries, so we can break the 'broken client' logic (note: we assert that we are
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
--- a/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -81,7 +81,7 @@ sub insert_comment {
     warn Dumper( $cmt ) unless $cmt->{jitemid};
 
     my $jitem = LJ::Entry->new( $u, jitemid => $cmt->{jitemid} );
-    my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $cmt->{id} << 8 );
+    my $source = ( $cmt->{entry_source} || $jitem->prop( "import_source" ) ) . "?thread=" . ( $cmt->{id} << 8 );
     my $user = LJ::load_userid( $cmt->{posterid} )
         if $cmt->{posterid};
 
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm
--- a/cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -18,6 +18,8 @@ package DW::Worker::ContentImporter::Loc
 package DW::Worker::ContentImporter::Local::Entries;
 use strict;
 
+use Carp qw/ croak /;
+
 =head1 NAME
 
 DW::Worker::ContentImporter::Local::Entries - Local data utilities for entries
@@ -35,19 +37,21 @@ sub get_entry_map {
 sub get_entry_map {
     my ( $class, $u ) = @_;
 
-    my $p = LJ::get_prop( "log", "import_source" );
-    return {} unless $p;
-
-    my $dbr = LJ::get_cluster_reader( $u );
-    my %map;
-    my $sth = $dbr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" );
+    my $p = LJ::get_prop( log => 'import_source' )
+        or croak 'unable to load logprop';
+    my $dbcr = LJ::get_cluster_reader( $u )
+        or croak 'unable to connect to database';
+    my $sth = $dbcr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" )
+        or croak 'unable to prepare SQL';
 
     $sth->execute( $u->id, $p->{id} );
+    croak 'database error: ' . $sth->errstr
+        if $sth->err;
 
+    my %map;
     while ( my ( $jitemid, $value ) = $sth->fetchrow_array ) {
         $map{$value} = $jitemid;
     }
-
     return \%map;
 }
 
--------------------------------------------------------------------------------

Post a comment in response:

This account has disabled anonymous posting.
If you don't have an account you can create one now.
HTML doesn't work in the subject.
More info about formatting

If you are unable to use this captcha for any reason, please contact us by email at support@dreamwidth.org