mark: A photo of Mark kneeling on top of the Taal Volcano in the Philippines. It was a long hike. (Default)
Mark Smith ([staff profile] mark) wrote in [site community profile] changelog, 2009-08-11 05:17 am

[dw-free] crossposter: importing crossposted entries

[commit: http://hg.dwscoalition.org/dw-free/rev/3a86879e954b]

http://bugs.dwscoalition.org/show_bug.cgi?id=952

* [profile] exor674: Allow the importer to be aware of crossposted
entries, so we don't turn around and import those entries and create
duplicates.

* [staff profile] mark: Fix a bug where the manual advance mode of the entry
importer can sometimes loop. Also added error reporting/catching to some
functions.

Patch by [personal profile] exor674 and [staff profile] mark.

Files modified:
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
  • cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
  • cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm
--------------------------------------------------------------------------------
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -27,6 +27,7 @@ use LWP::UserAgent;
 use LWP::UserAgent;
 use XMLRPC::Lite;
 use Digest::MD5 qw/ md5_hex /;
+use DW::External::Account;
 
 # storage for import related stuff
 our %MAPS;
@@ -258,6 +259,69 @@ sub get_lj_session {
     return $r->{ljsession};
 }
 
+=head2 C<< $class->get_xpost_map( $user, $hashref ) >>
+
+Returns a hashref mapping jitemids to crossposted entries.
+
+=cut
+
+sub get_xpost_map {
+    my ( $class, $u, $data ) = @_;
+
+    # find the external account matching the import source; without one, return an empty map
+    my $acct = $class->find_matching_acct( $u, $data );
+    return {} unless $acct;
+    # NOTE(review): croak needs Carp loaded in this file; no such import is visible in this diff -- confirm
+    # load the 'xpost' logprop, get a cluster reader for $u and prepare the query; each step croaks on failure
+    my $p = LJ::get_prop( log => 'xpost' )
+        or croak 'unable to get xpost logprop';
+    my $dbcr = LJ::get_cluster_reader( $u )
+        or croak 'unable to get user cluster reader';
+    my $sth = $dbcr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" )
+        or croak 'unable to prepare statement';
+
+    # fetch every xpost prop row for this journal, croaking if the statement handle reports an error
+    $sth->execute( $u->id, $p->{id} );
+    croak 'database error: ' . $sth->errstr
+        if $sth->err;
+
+    # ( xpost value stored for $acct => local jitemid ); the key is presumably the remote item id -- confirm
+    my %map;
+
+    # build the mapping above, one logprop2 row at a time
+    while ( my ( $jitemid, $value ) = $sth->fetchrow_array ) {
+        # decompose the xposter data (note: this lexical $data shadows the sub's $data argument)
+        my $data = DW::External::Account->xpost_string_to_hash( $value );
+        my $xpost = $data->{$acct->acctid}
+            or next;
+
+        # this entry has xpost data for the matched account, record it
+        $map{$xpost} = $jitemid;
+    }
+
+    return \%map;
+}
+
+sub find_matching_acct {
+    my ( $class, $u, $data ) = @_;
+    # returns the first of $u's external accounts whose host and username both match $data, else undef
+    my @accts = DW::External::Account->get_external_accounts($u);
+    # hostnames are compared with any leading "www." stripped
+    my $dh = $data->{hostname};
+    $dh =~ s/^www\.//;
+
+    foreach my $acct (@accts) {
+        my $sh = $acct->serverhost;
+        $sh =~ s/^www\.//;
+        # host and username comparisons are both case-insensitive
+        next unless lc( $sh ) eq lc( $dh );
+        next unless lc( $acct->username ) eq lc( $data->{username} );
+        return $acct;
+    }
+    # no external account matched the import source
+    return undef;
+}
+
 sub xmlrpc_call_helper {
     # helper function that makes life easier on folks that call xmlrpc stuff.  this handles
     # running the actual request and checking for errors, as well as handling the cases where
@@ -266,7 +330,7 @@ sub xmlrpc_call_helper {
 
     # bail if depth is 4, obviously something is going terribly wrong
     if ( $depth >= 4 ) {
-        return 
+        return
             {
                 fault => 1,
                 faultString => 'Exceeded XMLRPC recursion limit.',
@@ -277,7 +341,7 @@ sub xmlrpc_call_helper {
     my $res;
     eval { $res = $xmlrpc->call($method, $req); };
     if ( $res && $res->fault ) {
-        return 
+        return
             {
                 fault => 1,
                 faultString => $res->fault->{faultString} || 'Unknown error.',
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -99,7 +99,12 @@ sub try_work {
     my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {};
     $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) );
 
+    # and xpost map
+    my $xpost_map = $class->get_xpost_map( $u, $data ) || {};
+    $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) );
+
     # now backfill into jitemid_map
+    my $entry_source = {};
     my $jitemid_map = {};
     $log->( 'Filtering parameters: hostname=[%s], username=[%s].', $data->{hostname}, $data->{username} );
     foreach my $url ( keys %$entry_map ) {
@@ -113,6 +118,12 @@ sub try_work {
         my $jitemid = $1 >> 8
             if $url =~ m!/(\d+)\.html$!;
         $jitemid_map->{$jitemid} = $entry_map->{$url};
+        $entry_source->{$jitemid_map->{$jitemid}} = $url;
+    }
+
+    foreach my $jitemid ( keys %$xpost_map ) {
+        $jitemid_map->{$jitemid} = $xpost_map->{$jitemid};
+        $entry_source->{$jitemid_map->{$jitemid}} = "CROSSPOSTER " . $data->{hostname} . " " . $data->{username} . " $jitemid "
     }
 
     # this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid)
@@ -316,6 +327,8 @@ sub try_work {
         $comment->{jitemid} = $jitemid_map->{$comment->{jitemid}};
         $comment->{orig_id} = $comment->{id};
 
+        $comment->{entry_source} = $entry_source->{$comment->{jitemid}};
+
         # unresolved comments means we haven't got the parent in the database
         # yet so we can't post this one
         $comment->{unresolved} = 1
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -97,6 +97,10 @@ sub try_work {
     my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {};
     $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) );
 
+    # and xpost map
+    my $xpost_map = $class->get_xpost_map( $u, $data ) || {};
+    $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) );
+
     # this is a helper sub that steps a MySQL formatted time by some offset
     # arguments: '2008-01-01 12:03:53', -1 ... returns '2008-01-01 12:03:52'
     my $step_time = sub {
@@ -156,7 +160,15 @@ sub try_work {
 
         delete $sync{$1 >> 8};
     }
-    $log->( 'Syncitems now has %d items post-prune.', scalar( keys %sync ) );
+    $log->( 'Syncitems now has %d items post-prune (first pass).', scalar( keys %sync ) );
+
+    # this is another optimization.  we know crossposted entries can be removed from
+    # the list of things we will import, as we generated them to begin with.
+    foreach my $itemid ( keys %$xpost_map ) {
+        delete $sync{$itemid};
+    }
+    $log->( 'Syncitems now has %d items post-prune (second pass).', scalar( keys %sync ) );
+
     $title->( 'post-prune' );
 
     # simple helper sub
@@ -179,9 +191,11 @@ sub try_work {
             $evt->{key} = $evt->{url};
 
             # skip this if we've already dealt with it before
-            $log->( '    %d %s %s; mapped = %d.', $evt->{itemid}, $evt->{url}, $evt->{realtime}, $entry_map->{$evt->{key}} );
+            $log->( '    %d %s %s; mapped = %d (import_source) || %d (xpost).',
+                    $evt->{itemid}, $evt->{url}, $evt->{realtime}, $entry_map->{$evt->{key}},
+                    $xpost_map->{$evt->{itemid}} );
             my $sync = delete $sync{$evt->{itemid}};
-            return if $entry_map->{$evt->{key}} || !defined $sync;
+            return if $entry_map->{$evt->{key}} || !defined $sync || $xpost_map->{$evt->{itemid}};
 
             # clean up event for LJ
             my @item_errors;
@@ -235,6 +249,11 @@ sub try_work {
         # calculate what time to get entries for
         my ( $tries, $lastgrab, $hash ) = ( 0, undef, undef );
 SYNC:   while ( $tries++ <= 10 ) {
+
+            # if we ever get in here with no entries left, we're done.  this sometimes happens
+            # when the manual advance import code hits the end of the sidewalk and runs out of
+            # things to import.
+            last unless keys %sync;
 
             # calculate the oldest entry we haven't retrieved yet, and offset that time by
             # $tries, so we can break the 'broken client' logic (note: we assert that we are
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
--- a/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -81,7 +81,7 @@ sub insert_comment {
     warn Dumper( $cmt ) unless $cmt->{jitemid};
 
     my $jitem = LJ::Entry->new( $u, jitemid => $cmt->{jitemid} );
-    my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $cmt->{id} << 8 );
+    my $source = ( $cmt->{entry_source} || $jitem->prop( "import_source" ) ) . "?thread=" . ( $cmt->{id} << 8 );
     my $user = LJ::load_userid( $cmt->{posterid} )
         if $cmt->{posterid};
 
diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm
--- a/cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm	Tue Aug 11 03:15:05 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm	Tue Aug 11 05:17:08 2009 +0000
@@ -18,6 +18,8 @@ package DW::Worker::ContentImporter::Loc
 package DW::Worker::ContentImporter::Local::Entries;
 use strict;
 
+use Carp qw/ croak /;
+
 =head1 NAME
 
 DW::Worker::ContentImporter::Local::Entries - Local data utilities for entries
@@ -35,19 +37,21 @@ sub get_entry_map {
 sub get_entry_map {
     my ( $class, $u ) = @_;
 
-    my $p = LJ::get_prop( "log", "import_source" );
-    return {} unless $p;
-
-    my $dbr = LJ::get_cluster_reader( $u );
-    my %map;
-    my $sth = $dbr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" );
+    my $p = LJ::get_prop( log => 'import_source' )
+        or croak 'unable to load logprop';
+    my $dbcr = LJ::get_cluster_reader( $u )
+        or croak 'unable to connect to database';
+    my $sth = $dbcr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" )
+        or croak 'unable to prepare SQL';
 
     $sth->execute( $u->id, $p->{id} );
+    croak 'database error: ' . $sth->errstr
+        if $sth->err;
 
+    my %map;
     while ( my ( $jitemid, $value ) = $sth->fetchrow_array ) {
         $map{$value} = $jitemid;
     }
-
     return \%map;
 }
 
--------------------------------------------------------------------------------
yvi: Kaylee half-smiling, looking very pretty (Default)

[personal profile] yvi 2009-08-11 07:24 am (UTC)(link)
Oh, this looks like it will be well-received :)
cesy: "Cesy" - An old-fashioned quill and ink (Default)

[personal profile] cesy 2009-08-12 10:00 am (UTC)(link)
I am really looking forward to being able to import the comments on cross-posted entries.