[dw-free] crossposter: importing crossposted entries
[commit: http://hg.dwscoalition.org/dw-free/rev/3a86879e954b]
http://bugs.dwscoalition.org/show_bug.cgi?id=952
*
exor674: Allow the importer to be aware of crossposted
entries, so we don't turn around and import those entries and create
duplicates.
*
mark: Fix a bug where the manual advance mode of the entry
importer can sometimes loop. Also added error reporting/catching to some
functions.
Patch by
exor674 and
mark.
Files modified:
http://bugs.dwscoalition.org/show_bug.cgi?id=952
*
![[profile]](https://www.dreamwidth.org/img/silk/identity/user.png) exor674: Allow the importer to be aware of crossposted
entries, so we don't turn around and import those entries and create
duplicates.
*
![[staff profile]](https://www.dreamwidth.org/img/silk/identity/user_staff.png) mark: Fix a bug where the manual advance mode of the entry
importer can sometimes loop. Also added error reporting/catching to some
functions.
Patch by
![[personal profile]](https://www.dreamwidth.org/img/silk/identity/user.png) exor674 and
![[staff profile]](https://www.dreamwidth.org/img/silk/identity/user_staff.png) mark.
Files modified:
- cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
- cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm
- cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm
-------------------------------------------------------------------------------- diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm Tue Aug 11 03:15:05 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm Tue Aug 11 05:17:08 2009 +0000 @@ -27,6 +27,7 @@ use LWP::UserAgent; use LWP::UserAgent; use XMLRPC::Lite; use Digest::MD5 qw/ md5_hex /; +use DW::External::Account; # storage for import related stuff our %MAPS; @@ -258,6 +259,69 @@ sub get_lj_session { return $r->{ljsession}; } +=head2 C<< $class->get_xpost_map( $user, $hashref ) + +Returns a hashref mapping jitemids to crossposted entries. + +=cut + +sub get_xpost_map { + my ( $class, $u, $data ) = @_; + + # see if the account we're importing from is configured to crosspost + my $acct = $class->find_matching_acct( $u, $data ); + return {} unless $acct; + + # connect to the database and ready the sql + my $p = LJ::get_prop( log => 'xpost' ) + or croak 'unable to get xpost logprop'; + my $dbcr = LJ::get_cluster_reader( $u ) + or croak 'unable to get user cluster reader'; + my $sth = $dbcr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" ) + or croak 'unable to prepare statement'; + + # now look up the values we need + $sth->execute( $u->id, $p->{id} ); + croak 'database error: ' . 
$sth->errstr + if $sth->err; + + # ( remote jitemid => local ditemid ) + my %map; + + # put together the mapping above + while ( my ( $jitemid, $value ) = $sth->fetchrow_array ) { + # decompose the xposter data + my $data = DW::External::Account->xpost_string_to_hash( $value ); + my $xpost = $data->{$acct->acctid} + or next; + + # this item was crossposted, record it + $map{$xpost} = $jitemid; + } + + return \%map; +} + +sub find_matching_acct { + my ( $class, $u, $data ) = @_; + + my @accts = DW::External::Account->get_external_accounts($u); + + my $dh = $data->{hostname}; + $dh =~ s/^www\.//; + + foreach my $acct (@accts) { + my $sh = $acct->serverhost; + $sh =~ s/^www\.//; + + next unless lc( $sh ) eq lc( $dh ); + next unless lc( $acct->username ) eq lc( $data->{username} ); + return $acct; + } + + return undef; +} + sub xmlrpc_call_helper { # helper function that makes life easier on folks that call xmlrpc stuff. this handles # running the actual request and checking for errors, as well as handling the cases where @@ -266,7 +330,7 @@ sub xmlrpc_call_helper { # bail if depth is 4, obviously something is going terribly wrong if ( $depth >= 4 ) { - return + return { fault => 1, faultString => 'Exceeded XMLRPC recursion limit.', @@ -277,7 +341,7 @@ sub xmlrpc_call_helper { my $res; eval { $res = $xmlrpc->call($method, $req); }; if ( $res && $res->fault ) { - return + return { fault => 1, faultString => $res->fault->{faultString} || 'Unknown error.', diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm Tue Aug 11 03:15:05 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm Tue Aug 11 05:17:08 2009 +0000 @@ -99,7 +99,12 @@ sub try_work { my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {}; $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) ); + # and xpost map + my $xpost_map = 
$class->get_xpost_map( $u, $data ) || {}; + $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) ); + # now backfill into jitemid_map + my $entry_source = {}; my $jitemid_map = {}; $log->( 'Filtering parameters: hostname=[%s], username=[%s].', $data->{hostname}, $data->{username} ); foreach my $url ( keys %$entry_map ) { @@ -113,6 +118,12 @@ sub try_work { my $jitemid = $1 >> 8 if $url =~ m!/(\d+)\.html$!; $jitemid_map->{$jitemid} = $entry_map->{$url}; + $entry_source->{$jitemid_map->{$jitemid}} = $url; + } + + foreach my $jitemid ( keys %$xpost_map ) { + $jitemid_map->{$jitemid} = $xpost_map->{$jitemid}; + $entry_source->{$jitemid_map->{$jitemid}} = "CROSSPOSTER " . $data->{hostname} . " " . $data->{username} . " $jitemid " } # this will take a talk_map (old URL -> new jtalkid) and convert it to a jtalkid map (old jtalkid -> new jtalkid) @@ -316,6 +327,8 @@ sub try_work { $comment->{jitemid} = $jitemid_map->{$comment->{jitemid}}; $comment->{orig_id} = $comment->{id}; + $comment->{entry_source} = $entry_source->{$comment->{jitemid}}; + # unresolved comments means we haven't got the parent in the database # yet so we can't post this one $comment->{unresolved} = 1 diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm Tue Aug 11 03:15:05 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm Tue Aug 11 05:17:08 2009 +0000 @@ -97,6 +97,10 @@ sub try_work { my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {}; $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) ); + # and xpost map + my $xpost_map = $class->get_xpost_map( $u, $data ) || {}; + $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) ); + # this is a helper sub that steps a MySQL formatted time by some offset # arguments: '2008-01-01 12:03:53', -1 ... 
returns '2008-01-01 12:03:52' my $step_time = sub { @@ -156,7 +160,15 @@ sub try_work { delete $sync{$1 >> 8}; } - $log->( 'Syncitems now has %d items post-prune.', scalar( keys %sync ) ); + $log->( 'Syncitems now has %d items post-prune (first pass).', scalar( keys %sync ) ); + + # this is another optimization. we know crossposted entries can be removed from + # the list of things we will import, as we generated them to begin with. + foreach my $itemid ( keys %$xpost_map ) { + delete $sync{$itemid}; + } + $log->( 'Syncitems now has %d items post-prune (second pass).', scalar( keys %sync ) ); + $title->( 'post-prune' ); # simple helper sub @@ -179,9 +191,11 @@ sub try_work { $evt->{key} = $evt->{url}; # skip this if we've already dealt with it before - $log->( ' %d %s %s; mapped = %d.', $evt->{itemid}, $evt->{url}, $evt->{realtime}, $entry_map->{$evt->{key}} ); + $log->( ' %d %s %s; mapped = %d (import_source) || %d (xpost).', + $evt->{itemid}, $evt->{url}, $evt->{realtime}, $entry_map->{$evt->{key}}, + $xpost_map->{$evt->{itemid}} ); my $sync = delete $sync{$evt->{itemid}}; - return if $entry_map->{$evt->{key}} || !defined $sync; + return if $entry_map->{$evt->{key}} || !defined $sync || $xpost_map->{$evt->{itemid}}; # clean up event for LJ my @item_errors; @@ -235,6 +249,11 @@ sub try_work { # calculate what time to get entries for my ( $tries, $lastgrab, $hash ) = ( 0, undef, undef ); SYNC: while ( $tries++ <= 10 ) { + + # if we ever get in here with no entries left, we're done. this sometimes happens + # when the manual advance import code hits the end of the sidewalk and runs out of + # things to import. 
+ last unless keys %sync; # calculate the oldest entry we haven't retrieved yet, and offset that time by # $tries, so we can break the 'broken client' logic (note: we assert that we are diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm --- a/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm Tue Aug 11 03:15:05 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/Local/Comments.pm Tue Aug 11 05:17:08 2009 +0000 @@ -81,7 +81,7 @@ sub insert_comment { warn Dumper( $cmt ) unless $cmt->{jitemid}; my $jitem = LJ::Entry->new( $u, jitemid => $cmt->{jitemid} ); - my $source = $jitem->prop( "import_source" ) . "?thread=" . ( $cmt->{id} << 8 ); + my $source = ( $cmt->{entry_source} || $jitem->prop( "import_source" ) ) . "?thread=" . ( $cmt->{id} << 8 ); my $user = LJ::load_userid( $cmt->{posterid} ) if $cmt->{posterid}; diff -r 2036b42ff459 -r 3a86879e954b cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm --- a/cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm Tue Aug 11 03:15:05 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/Local/Entries.pm Tue Aug 11 05:17:08 2009 +0000 @@ -18,6 +18,8 @@ package DW::Worker::ContentImporter::Loc package DW::Worker::ContentImporter::Local::Entries; use strict; +use Carp qw/ croak /; + =head1 NAME DW::Worker::ContentImporter::Local::Entries - Local data utilities for entries @@ -35,19 +37,21 @@ sub get_entry_map { sub get_entry_map { my ( $class, $u ) = @_; - my $p = LJ::get_prop( "log", "import_source" ); - return {} unless $p; - - my $dbr = LJ::get_cluster_reader( $u ); - my %map; - my $sth = $dbr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" ); + my $p = LJ::get_prop( log => 'import_source' ) + or croak 'unable to load logprop'; + my $dbcr = LJ::get_cluster_reader( $u ) + or croak 'unable to connect to database'; + my $sth = $dbcr->prepare( "SELECT jitemid, value FROM logprop2 WHERE journalid = ? AND propid = ?" 
) + or croak 'unable to prepare SQL'; $sth->execute( $u->id, $p->{id} ); + croak 'database error: ' . $sth->errstr + if $sth->err; + my %map; while ( my ( $jitemid, $value ) = $sth->fetchrow_array ) { $map{$value} = $jitemid; } - return \%map; } --------------------------------------------------------------------------------
no subject
no subject