[dw-free] Allow importer to correctly import from multiple sources.
[commit: http://hg.dwscoalition.org/dw-free/rev/7d8ff1c41e7c]
http://bugs.dwscoalition.org/show_bug.cgi?id=596
Same approach as last time - look for the username + hostname to match URLs.
Also do some cleanup and remove unused code.
Patch by mark.
Files modified:
http://bugs.dwscoalition.org/show_bug.cgi?id=596
Same approach as last time - look for the username + hostname to match URLs.
Also do some cleanup and remove unused code.
Patch by ![[staff profile]](https://www.dreamwidth.org/img/silk/identity/user_staff.png) mark.
Files modified:
- cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm
- cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm
-------------------------------------------------------------------------------- diff -r 5135c626cf7f -r 7d8ff1c41e7c cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm Sat Apr 25 18:30:49 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal.pm Sat Apr 25 20:45:19 2009 +0000 @@ -30,170 +30,6 @@ use Digest::MD5 qw/ md5_hex /; # storage for import related stuff our %MAPS; - -# TODO: Expire session when we're done with it. -sub xx_work { - my ( $class, $job ) = @_; - my $opts = $job->arg; - - # predeclare a lot of variables we use in several places - my ( $u, $r, $ua ); - - # prepare $opts - $opts->{_rl_requests} ||= 3; - $opts->{_rl_seconds} ||= 1; - $opts->{errors} ||= []; - $opts->{md5password} ||= md5_hex( delete $opts->{password} ); - - # failure closer for permanent errors - my $fail = sub { - $u->set_prop( import_job => '' ) if $u; - $job->permanent_failure( sprintf( shift(), @_ ) ); - return; - }; - - # failure closer for permanent errors - my $temp_fail = sub { - $job->failed( sprintf( shift(), @_ ) ); - return; - }; - - # return a success, do proper cleanup - my $ok = sub { - $u->set_prop( import_job => '' ) if $u; - $job->completed; - return; - }; - - # general purpose error checking - return $fail->( "No server provided" ) unless $opts->{server}; - return $fail->( "No username provided" ) unless $opts->{user}; - return $fail->( "No password provided" ) unless $opts->{md5password}; - - # variable setup - $u = LJ::load_userid( $opts->{target} ) - or return $fail->( 'Unable to load target with id %d.', $opts->{target} ); - - # we will use this a lot later - $ua = LJ::get_useragent( - role => 'importer', - max_size => 524288, # half meg, this should be plenty - timeout => 20, # 20 seconds, might need tuning for slow sites - ); - - # get proper form of LJ URL, improper URL will cause issues later with export_comments.bml - unless ( $opts->{canonicalized} ) { - $opts->{server} = lc( $opts->{server} 
); - $opts->{user} = lc( $opts->{user} ); - - # textual form, or something weird - $opts->{server} =~ s/^.* at //; - $opts->{server} =~ s/ dot /./; - - # in case the user put a full URL in, instead of a host - $opts->{server} =~ s!https?://!!; - $opts->{server} =~ s!^.*?@!!; - $opts->{server} =~ s!/.*$!!; - - # or their journal URL, underlines are dashes - $opts->{user} =~ s/_/-/g; - $opts->{server} =~ s/_/-/g; - $opts->{server} =~ s/$opts->{user}/www/; - - $opts->{self_user} = $u->username; - $opts->{self_user} =~ s/_/-/g; - - DW::Worker::ContentImporter->ratelimit_request( $opts ); - $r = $ua->get( "http://$opts->{server}/" ); - $opts->{server} = $r->request->uri->host if $r->is_success; - $opts->{canonicalized} = 1; - $opts->{bad} = 1 unless $r->is_success; - } - - # get a session, will use it at some point - DW::Worker::ContentImporter->ratelimit_request( $opts ); - $r = call_xmlrpc( $opts, 'sessiongenerate', { expiration => 'short' } ) unless $opts->{bad}; - if ( $opts->{bad} || !$r || $r->{fault} ) { - LJ::send_mail( { - to => $u->email_raw, - from => $LJ::BOGUS_EMAIL, - body => <<EOF -Dear $u->{user}, - -We encountered an error trying to import your journal $opts->{user} at $opts->{server}. - -Please verify that you gave us your correct username, password or site, and try again. - -Regards, -The $LJ::SITENAME Team -EOF - } ); - return $fail->( "Invalid username/password." ); - } - - # local variable for later - my $session = $r->{ljsession}; - - # this is the best I can think of to get the user's path prefix from the site. - unless ( $opts->{user_path} ) { - DW::Worker::ContentImporter->ratelimit_request( $opts ); - $r = $ua->get( "http://$opts->{server}/~$opts->{user}" ); - return $temp_fail->( 'Could not get proper URL.' ) unless $r->is_success; - my $url = $r->request->uri->as_string; - $url =~ s/\/$//; - $opts->{user_path} = $url; - } - - # looks good, everything seems to be in order. 
let's fire off a few - # new jobs to do the work of the various import tasks. we do this in - # separate jobs in case of failure in one particular mode, and so we - # can rate limit things separately. - my $sh = LJ::theschwartz() - or return $temp_fail->( 'failed to get TheSchwartz handle' ); - -# TODO(mark): add Userpics Comments Entries Bio Friends - foreach my $func ( qw/ Tags Friends / ) { - my $job = TheSchwartz::Job->new( - funcname => "DW::Worker::ContentImporter::LiveJournal::$func", - uniqkey => "import-" . $u->id . "-" . lc( $func ), - arg => $opts, - ) or return $temp_fail->( 'failed to create job' ); - - $sh->insert( $job ) - or return $temp_fail->( "unable to insert $func job" ); - } - - return $ok->(); - - - my $email = <<EOF; -Dear $u->{user}, - -Your journal $opts->{user} at $opts->{server} has been imported. - -EOF - if ( $opts->{userpics_later} ) { - $email .= "\n\nNot all of your userpictures have been imported. Please go to (site) to decide which you wish to import.\n"; - } - if ( scalar @{$opts->{errors}} ) { - $email .= "\n\nHowever, we were unfortunately unable to import the following items, and you will have to do them manually:\n"; - foreach my $item ( @{$opts->{errors}} ) { - $email .= " * $item\n"; - } - } - $email .= <<EOF; - -Regards, -The $LJ::SITENAME Team -EOF - LJ::send_mail( { - to => $u->email_raw, - from => $LJ::BOGUS_EMAIL, - body => $email - } ); - $u->set_prop( "import_job", '' ); - $job->completed; -} sub keep_exit_status_for { 0 } sub grab_for { 600 } diff -r 5135c626cf7f -r 7d8ff1c41e7c cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm Sat Apr 25 18:30:49 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Comments.pm Sat Apr 25 20:45:19 2009 +0000 @@ -99,7 +99,9 @@ sub try_work { my $jitemid_map = {}; foreach my $url ( keys %$entry_map ) { # this works, see the Entries importer for more information - next unless $url =~ 
/\Q$data->{hostname}\E/; + $url =~ s/-/_/g; # makes \b work below + next unless $url =~ /\Q$data->{hostname}\E/ && + $url =~ /\b$data->{username}\b/; my $jitemid = $1 >> 8 if $url =~ m!/(\d+)\.html$!; diff -r 5135c626cf7f -r 7d8ff1c41e7c cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm --- a/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm Sat Apr 25 18:30:49 2009 +0000 +++ b/cgi-bin/DW/Worker/ContentImporter/LiveJournal/Entries.pm Sat Apr 25 20:45:19 2009 +0000 @@ -141,7 +141,9 @@ sub try_work { # from. this assumes URLs never have other hostnames, so if someone were to # register testlivejournal.com and do an import, they will have trouble # importing. if they want to do that to befunge this logic, more power to them. - next unless $url =~ /\Q$data->{hostname}\E/; + $url =~ s/-/_/g; # makes \b work below + next unless $url =~ /\Q$data->{hostname}\E/ && + $url =~ /\b$data->{username}\b/; unless ( $url =~ m!/(\d+)\.html$! ) { $log->( 'URL %s not of expected format in prune.', $url ); --------------------------------------------------------------------------------