mark: A photo of Mark kneeling on top of the Taal Volcano in the Philippines. It was a long hike. (Default)
Mark Smith ([staff profile] mark) wrote in [site community profile] changelog, 2009-04-25 08:45 pm

[dw-free] Allow importer to correctly import from multiple sources.

[commit: http://hg.dwscoalition.org/dw-free/rev/7d8ff1c41e7c]

http://bugs.dwscoalition.org/show_bug.cgi?id=596

Same approach as last time - look for the username + hostname to match URLs.
Also do some cleanup and remove unused code.

Patch by [staff profile] mark.

Files modified:
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
  • cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
--------------------------------------------------------------------------------
diff -r 5135c626cf7f -r 7d8ff1c41e7c cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm	Sat Apr 25 18:30:49 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm	Sat Apr 25 20:45:19 2009 +0000
@@ -30,170 +30,6 @@ use Digest::MD5 qw/ md5_hex /;
 
 # storage for import related stuff
 our %MAPS;
-
-# TODO: Expire session when we're done with it.
-sub xx_work {
-    my ( $class, $job ) = @_;
-    my $opts = $job->arg;
-
-    # predeclare a lot of variables we use in several places
-    my ( $u, $r, $ua );
-
-    # prepare $opts
-    $opts->{_rl_requests} ||= 3;
-    $opts->{_rl_seconds}  ||= 1;
-    $opts->{errors}       ||= [];
-    $opts->{md5password}  ||= md5_hex( delete $opts->{password} );
-
-    # failure closer for permanent errors
-    my $fail = sub {
-        $u->set_prop( import_job => '' ) if $u;
-        $job->permanent_failure( sprintf( shift(), @_ ) );
-        return;
-    };
-
-    # failure closer for permanent errors
-    my $temp_fail = sub {
-        $job->failed( sprintf( shift(), @_ ) );
-        return;
-    };
-
-    # return a success, do proper cleanup
-    my $ok = sub {
-        $u->set_prop( import_job => '' ) if $u;
-        $job->completed;
-        return;
-    };
-
-    # general purpose error checking
-    return $fail->( "No server provided" ) unless $opts->{server};
-    return $fail->( "No username provided" ) unless $opts->{user};
-    return $fail->( "No password provided" ) unless $opts->{md5password};
-
-    # variable setup
-    $u = LJ::load_userid( $opts->{target} )
-        or return $fail->( 'Unable to load target with id %d.', $opts->{target} );
-
-    # we will use this a lot later
-    $ua = LJ::get_useragent(
-        role     => 'importer',
-        max_size => 524288, # half meg, this should be plenty
-        timeout  => 20,     # 20 seconds, might need tuning for slow sites
-    );
-
-    # get proper form of LJ URL, improper URL will cause issues later with export_comments.bml
-    unless ( $opts->{canonicalized} ) {
-        $opts->{server} = lc( $opts->{server} );
-        $opts->{user} = lc( $opts->{user} );
-
-        # textual form, or something weird
-        $opts->{server} =~ s/^.* at //;
-        $opts->{server} =~ s/ dot /./;
-
-        # in case the user put a full URL in, instead of a host
-        $opts->{server} =~ s!https?://!!;
-        $opts->{server} =~ s!^.*?@!!;
-        $opts->{server} =~ s!/.*$!!;
-
-        # or their journal URL, underlines are dashes
-        $opts->{user} =~ s/_/-/g;
-        $opts->{server} =~ s/_/-/g;
-        $opts->{server} =~ s/$opts->{user}/www/;
-
-        $opts->{self_user} = $u->username;
-        $opts->{self_user} =~ s/_/-/g;
-
-        DW::Worker::ContentImporter->ratelimit_request( $opts );
-        $r = $ua->get( "http://$opts->{server}/" );
-        $opts->{server} = $r->request->uri->host if $r->is_success;
-        $opts->{canonicalized} = 1;
-        $opts->{bad} = 1 unless $r->is_success;
-    }
-
-    # get a session, will use it at some point
-    DW::Worker::ContentImporter->ratelimit_request( $opts );
-    $r = call_xmlrpc( $opts, 'sessiongenerate', { expiration => 'short' } ) unless $opts->{bad};
-    if ( $opts->{bad} || !$r || $r->{fault} ) {
-            LJ::send_mail( {
-                    to => $u->email_raw,
-                    from => $LJ::BOGUS_EMAIL,
-                    body => <<EOF
-Dear $u->{user},
-
-We encountered an error trying to import your journal $opts->{user} at $opts->{server}.
-
-Please verify that you gave us your correct username, password or site, and try again.
-
-Regards,
-The $LJ::SITENAME Team
-EOF
-                } );
-        return $fail->( "Invalid username/password." );
-    }
-
-    # local variable for later
-    my $session = $r->{ljsession};
-
-    # this is the best I can think of to get the user's path prefix from the site.
-    unless ( $opts->{user_path} ) {
-        DW::Worker::ContentImporter->ratelimit_request( $opts );
-        $r = $ua->get( "http://$opts->{server}/~$opts->{user}" );
-        return $temp_fail->( 'Could not get proper URL.' ) unless $r->is_success;
-        my $url = $r->request->uri->as_string;
-        $url =~ s/\/$//;
-        $opts->{user_path} = $url;
-    }
-
-    # looks good, everything seems to be in order. let's fire off a few
-    # new jobs to do the work of the various import tasks. we do this in
-    # separate jobs in case of failure in one particular mode, and so we
-    # can rate limit things separately.
-    my $sh = LJ::theschwartz()
-        or return $temp_fail->( 'failed to get TheSchwartz handle' );
-
-# TODO(mark): add Userpics Comments Entries Bio Friends
-    foreach my $func ( qw/ Tags Friends / ) {
-        my $job = TheSchwartz::Job->new(
-            funcname => "DW::Worker::ContentImporter::LiveJournal::$func",
-            uniqkey => "import-" . $u->id . "-" . lc( $func ),
-            arg => $opts,
-        ) or return $temp_fail->( 'failed to create job' );
-
-        $sh->insert( $job )
-            or return $temp_fail->( "unable to insert $func job" );
-    }
-
-    return $ok->();
-
-
-    my $email = <<EOF;
-Dear $u->{user},
-
-Your journal $opts->{user} at $opts->{server} has been imported.
-
-EOF
-    if ( $opts->{userpics_later} ) {
-        $email .= "\n\nNot all of your userpictures have been imported. Please go to (site) to decide which you wish to import.\n";
-    }
-    if ( scalar @{$opts->{errors}} ) {
-        $email .= "\n\nHowever, we were unfortunately unable to import the following items, and you will have to do them manually:\n";
-        foreach my $item ( @{$opts->{errors}} ) {
-            $email .= " * $item\n";
-        }
-    }
-    $email .= <<EOF;
-
-Regards,
-The $LJ::SITENAME Team
-EOF
-    LJ::send_mail( {
-            to => $u->email_raw,
-            from => $LJ::BOGUS_EMAIL,
-            body => $email
-        } );
-    $u->set_prop( "import_job", '' );
-    $job->completed;
-}
 
 sub keep_exit_status_for { 0 }
 sub grab_for { 600 }
diff -r 5135c626cf7f -r 7d8ff1c41e7c cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm	Sat Apr 25 18:30:49 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm	Sat Apr 25 20:45:19 2009 +0000
@@ -99,7 +99,9 @@ sub try_work {
     my $jitemid_map = {};
     foreach my $url ( keys %$entry_map ) {
         # this works, see the Entries importer for more information
-        next unless $url =~ /\Q$data->{hostname}\E/;
+        $url =~ s/-/_/g; # makes \b work below
+        next unless $url =~ /\Q$data->{hostname}\E/ &&
+                    $url =~ /\b$data->{username}\b/;
 
         my $jitemid = $1 >> 8
             if $url =~ m!/(\d+)\.html$!;
diff -r 5135c626cf7f -r 7d8ff1c41e7c cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
--- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm	Sat Apr 25 18:30:49 2009 +0000
+++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm	Sat Apr 25 20:45:19 2009 +0000
@@ -141,7 +141,9 @@ sub try_work {
         # from.  this assumes URLs never have other hostnames, so if someone were to
         # register testlivejournal.com and do an import, they will have trouble
         # importing.  if they want to do that to befunge this logic, more power to them.
-        next unless $url =~ /\Q$data->{hostname}\E/;
+        $url =~ s/-/_/g; # makes \b work below
+        next unless $url =~ /\Q$data->{hostname}\E/ &&
+                    $url =~ /\b$data->{username}\b/;
 
         unless ( $url =~ m!/(\d+)\.html$! ) {
             $log->( 'URL %s not of expected format in prune.', $url );
--------------------------------------------------------------------------------

Post a comment in response:

This account has disabled anonymous posting.
If you don't have an account you can create one now.
HTML doesn't work in the subject.
More info about formatting

If you are unable to use this captcha for any reason, please contact us by email at support@dreamwidth.org