fu: Close-up of Fu, bringing a scoop of water to her mouth (Default)
fu ([personal profile] fu) wrote in [site community profile] changelog, 2011-03-07 07:22 am

[dw-free] Recognize authors in different formats in the feeds we pull in

[commit: http://hg.dwscoalition.org/dw-free/rev/0e2ad8ec3049]

http://bugs.dwscoalition.org/show_bug.cgi?id=2257

Recognize the author tag in atom feeds, and dc:creator in RSS feeds. Remove
some redundancy when processing the dc: namespace.

Patch by [personal profile] kareila.

Files modified:
  • cgi-bin/LJ/ParseFeed.pm
  • cgi-bin/LJ/SynSuck.pm
  • t/parsefeed-authors.t
--------------------------------------------------------------------------------
diff -r 543e7334dcbf -r 0e2ad8ec3049 cgi-bin/LJ/ParseFeed.pm
--- a/cgi-bin/LJ/ParseFeed.pm	Thu Mar 03 14:15:44 2011 +0800
+++ b/cgi-bin/LJ/ParseFeed.pm	Mon Mar 07 15:21:06 2011 +0800
@@ -79,6 +79,11 @@ sub parse_feed
     # try parsing it as RSS
     $parser = new XML::RSS;
     return ("", "failed to create RSS parser") unless $parser;
+
+    # custom LJ/DW namespaces
+    $parser->add_module( prefix => 'nslj',
+                         uri => 'http://www.livejournal.org/rss/lj/1.0/' );
+
     eval {
         $parser->parse($content);
     };
@@ -105,7 +110,6 @@ sub parse_feed
         $item->{'link'} = $_->{'link'} if $_->{'link'};
         $item->{'id'} = $_->{'guid'} if $_->{'guid'};
 
-        my $nsdc = 'http://purl.org/dc/elements/1.1/';
         my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
         if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {
             # prefer content:encoded if present
@@ -113,21 +117,19 @@ sub parse_feed
                 if defined $_->{$nsenc}->{'encoded'};
         }
 
-        my ( $time, $dc_ref );
+        my ( $time, $author );
         $time = time822_to_time( $_->{pubDate} ) if $_->{pubDate};
+        $author = $_->{nslj}->{poster}
+            if $_->{nslj} && ref $_->{nslj} eq "HASH";
 
-        if ( $_->{$nsdc} && ref $_->{$nsdc} eq "HASH" ) {
-            $dc_ref = $_->{$nsdc};
-        } elsif ( $_->{dc} && ref $_->{dc} eq "HASH" ) {
-            $dc_ref = $_->{dc};
-        }
-
-        if ( defined $dc_ref ) {
-            $item->{author} = $dc_ref->{creator} if $dc_ref->{creator};
-            $time = w3cdtf_to_time( $dc_ref->{date} ) if $dc_ref->{date};
+        # Dublin Core
+        if ( $_->{dc} && ref $_->{dc} eq "HASH" ) {
+            $author = $_->{dc}->{creator} if $_->{dc}->{creator};
+            $time = w3cdtf_to_time( $_->{dc}->{date} ) if $_->{dc}->{date};
         }
 
         $item->{time} = $time if $time;
+        $item->{author} = $author if $author;
         push @{ $feed->{items} }, $item;
     }
 
@@ -340,6 +342,31 @@ sub StartTag {
             last TAGS;
         }
 
+        # we want to store the value of the nested <name> element
+        # in the author slot, not accumulate the raw value -
+        # use temp key "inauth" to detect the nesting
+
+        if ( $tag eq 'author' ) {
+            $holder->{inauth} = 1;
+            last TAGS;
+        }
+
+        if ( $tag eq 'name' ) {
+            if ( $holder->{inauth} ) {
+                startaccum( 'author' );
+            } else {
+                swallow();
+            }
+            last TAGS;
+        }
+
+        if ( $tag eq 'poster' ) {
+            $holder->{ljposter} = $_{user};
+            return err( "No user attribute in <$tag>" )
+                unless $holder->{ljposter};
+            last TAGS;
+        }
+
         # store tags which should require no further
         # processing as they are, and others under _atom_*, to be processed
         # in EndTag under </entry>
@@ -465,12 +492,14 @@ sub EndTag {
                     $time = "$1-$2-$3 $4:$5";
                 }
             }
-            if ($time) {
-                $item->{'time'} = $time;
-            }
-            
+            $item->{time} = $time if $time;
+
+            # if we found ljposter, use that as preferred author
+            $item->{author} = $item->{ljposter} if defined $item->{ljposter};
+            delete $item->{ljposter};
+
             # get rid of all other tags we don't need anymore
-            foreach (keys  %$item) {
+            foreach ( keys %$item ) {
                 delete $item->{$_} if substr($_, 0, 6) eq '_atom_';
             }
             
@@ -478,10 +507,23 @@ sub EndTag {
             undef $item;
             last TAGS;
         }
+
+        if ( $tag eq 'author' ) {
+            my $holder = $item ? $item : $feed;
+            delete $holder->{inauth};
+            last TAGS;
+        }
+
         if ($tag eq 'feed') {
             # finalize feed
+
+            # if feed author exists, all items should default to it
+            if ( defined $feed->{author} ) {
+                $_->{author} ||= $feed->{author} foreach @items;
+            }
+
             # get rid of all other tags we don't need anymore
-            foreach (keys  %$feed) {
+            foreach ( keys %$feed ) {
                 delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';
             }
             
diff -r 543e7334dcbf -r 0e2ad8ec3049 cgi-bin/LJ/SynSuck.pm
--- a/cgi-bin/LJ/SynSuck.pm	Thu Mar 03 14:15:44 2011 +0800
+++ b/cgi-bin/LJ/SynSuck.pm	Mon Mar 07 15:21:06 2011 +0800
@@ -304,7 +304,7 @@ sub process_content {
     foreach my $it (@items) {
 
         # remove the SvUTF8 flag.  it's still UTF-8, but
-        # we don't want perl knowing that and fucking stuff up
+        # we don't want perl knowing that and messing stuff up
         # for us behind our back in random places all over
         # http://zilla.livejournal.org/show_bug.cgi?id=1037
         foreach my $attr (qw(id subject text link author)) {
diff -r 543e7334dcbf -r 0e2ad8ec3049 t/parsefeed-authors.t
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/t/parsefeed-authors.t	Mon Mar 07 15:21:06 2011 +0800
@@ -0,0 +1,156 @@
+# -*-perl-*-
+use strict;
+use Test::More tests => 10;    # 2 parse-status checks + 3 RSS author checks + 5 Atom author checks
+
+use lib "$ENV{LJHOME}/cgi-bin";
+require 'ljlib.pl';
+
+use LJ::ParseFeed;
+# RSS fixture: one item per author source (dc:creator, lj:poster, dw:poster).
+my $feed_rss = q {
+<rss version='2.0'
+    xmlns:lj='http://www.livejournal.org/rss/lj/1.0/'
+    xmlns:dw='http://www.livejournal.org/rss/lj/1.0/'
+    xmlns:atom10='http://www.w3.org/2005/Atom'
+    xmlns:dc='http://purl.org/dc/elements/1.1/'>
+<channel>
+  <title>Title</title>
+  <link>http://examplecomm.dream.fu/</link>
+  <description>Title - Dreamwidth Studios</description>
+  <lastBuildDate>Thu, 03 Feb 2011 17:00:43 GMT</lastBuildDate>
+  <generator>LiveJournal / Dreamwidth Studios</generator>
+  <lj:journal>examplecomm</lj:journal>
+  <lj:journaltype>community</lj:journaltype>
+  <atom10:link rel='self' href='http://examplecomm.dream.fu/data/rss' />
+  <image>
+    <url>http://www.dream.fu/userpic/1/2</url>
+    <title>Title</title>
+    <link>http://examplecomm.dream.fu/</link>
+    <width>100</width>
+    <height>100</height>
+  </image>
+
+<item>
+  <guid isPermaLink='true'>http://examplecomm.dream.fu/12345.html</guid>
+  <pubDate>Thu, 03 Feb 2011 17:00:43 GMT</pubDate>
+  <title>yo</title>
+  <link>http://examplecomm.dream.fu/12345.html</link>
+  <description>yo</description>
+  <comments>http://examplecomm.dream.fu/12345.html</comments>
+  <dc:creator>example-dc-creator</dc:creator>
+</item>
+
+<item>
+  <guid isPermaLink='true'>http://examplecomm.dream.fu/123.html</guid>
+  <pubDate>Wed, 24 Nov 2010 06:52:33 GMT</pubDate>
+  <title>yo</title>
+  <link>http://examplecomm.dream.fu/123.html</link>
+  <description>yo</description>
+  <comments>http://examplecomm.dream.fu/123.html</comments>
+  <lj:poster>example-lj-poster</lj:poster>
+  <lj:security>public</lj:security>
+  <lj:reply-count>0</lj:reply-count>
+</item>
+
+<item>
+  <guid isPermaLink='true'>http://examplecomm.dream.fu/456.html</guid>
+  <pubDate>Thu, 24 Jun 2010 06:52:33 GMT</pubDate>
+  <title>yo</title>
+  <link>http://examplecomm.dream.fu/456.html</link>
+  <description>yo</description>
+  <comments>http://examplecomm.dream.fu/456.html</comments>
+  <dw:poster>example-dw-poster</dw:poster>
+  <dw:security>public</dw:security>
+  <dw:reply-count>0</dw:reply-count>
+</item>
+
+</channel>
+</rss>};
+# Atom fixture: a feed-level <author>, then entries with <author>, dw:poster, lj:poster, neither, and both.
+my $feed_atom = q {<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dw="http://www.dreamwidth.org" xmlns:lj="http://www.livejournal.com">
+    <title>Feed title</title>
+    <link rel="alternate" type="text/html" href="http://example.com/feed/atom" />
+    <id>example:atom:feed</id>
+    <updated>2011-01-23T17:38:49-08:00</updated>
+    <author>
+        <name>example-feed-author</name>
+    </author>
+
+    <entry>
+        <title>Item 1</title>
+        <link rel="alternate" type="text/html" href="http://example.com/feed/atom/1" />
+        <id>1</id>
+        <published>2011-01-23T13:58:08-08:00</published>
+        <updated>2011-01-23T13:58:08-08:00</updated>
+        <author>
+            <name>example-atom-author</name>
+        </author>
+        <content type="html">foo</content>
+    </entry>
+
+    <entry>
+        <title>Item 2</title>
+        <link rel="alternate" type="text/html" href="http://example.com/feed/atom/2" />
+        <id>2</id>
+        <published>2011-01-23T13:59:55-08:00</published>
+        <updated>2011-01-23T13:59:55-08:00</updated>
+        <dw:poster user="example-dw-poster"/>
+        <content type="html">bar</content>
+    </entry>
+
+    <entry>
+        <title>Item 3</title>
+        <link rel="alternate" type="text/html" href="http://example.com/feed/atom/3" />
+        <id>3</id>
+        <published>2011-01-23T17:38:49-08:00</published>
+        <updated>2011-01-23T17:38:49-08:00</updated>
+        <lj:poster user="example-lj-poster"/>
+        <content type="html">baz</content>
+    </entry>
+
+    <entry>
+        <title>Item 4</title>
+        <link rel="alternate" type="text/html" href="http://example.com/feed/atom/4" />
+        <id>4</id>
+        <published>2011-01-23T18:38:49-08:00</published>
+        <updated>2011-01-23T18:38:49-08:00</updated>
+        <content type="html">quux</content>
+    </entry>
+
+    <entry>
+        <title>Item 5</title>
+        <link rel="alternate" type="text/html" href="http://example.com/feed/atom/5" />
+        <id>5</id>
+        <published>2011-01-23T19:38:49-08:00</published>
+        <updated>2011-01-23T19:38:49-08:00</updated>
+        <lj:poster user="prefer-lj-poster"/>
+        <author>
+            <name>bogus-atom-author</name>
+        </author>
+        <content type="html">blech</content>
+    </entry>
+
+</feed>};
+# parse_feed is called in list context as ( $feed, $error ); the checks below expect $error to be undef.
+my ( $parse_rss, $rss_error ) = LJ::ParseFeed::parse_feed( $feed_rss, "rss" );
+is( $rss_error, undef, "RSS parse OK" );
+
+SKIP: {
+    skip "RSS parse failed", 3 if $rss_error;    # skip the 3 author checks when there is no usable feed
+    is( $parse_rss->{items}->[0]->{author}, "example-dc-creator", "<dc:creator> tag" );
+    is( $parse_rss->{items}->[1]->{author}, "example-lj-poster", "<lj:poster> tag" );
+    is( $parse_rss->{items}->[2]->{author}, "example-dw-poster", "<dw:poster> tag" );
+}
+
+my ( $parse_atom, $atom_error ) = LJ::ParseFeed::parse_feed( $feed_atom, "atom" );
+is( $atom_error, undef, "Atom parse OK" );
+
+SKIP: {
+    skip "Atom parse failed", 5 if $atom_error;    # skip the 5 author checks when there is no usable feed
+    is( $parse_atom->{items}->[0]->{author}, "example-atom-author", "item <author> tag" );
+    is( $parse_atom->{items}->[1]->{author}, "example-dw-poster", "<dw:poster> tag" );
+    is( $parse_atom->{items}->[2]->{author}, "example-lj-poster", "<lj:poster> tag" );
+    is( $parse_atom->{items}->[3]->{author}, "example-feed-author", "feed <author> tag" );    # no per-entry author: falls back to feed author
+    is( $parse_atom->{items}->[4]->{author}, "prefer-lj-poster", "both <lj:poster> and <author> tags" );    # poster wins over <author>
+}
--------------------------------------------------------------------------------