[dw-free] Recognize authors in different formats in the feeds we pull in
[commit: http://hg.dwscoalition.org/dw-free/rev/0e2ad8ec3049]
http://bugs.dwscoalition.org/show_bug.cgi?id=2257
Recognize the author tag in atom feeds, and dc:creator in RSS feeds. Remove
some redundancy when processing the dc: namespace.
Patch by kareila.
Files modified:
http://bugs.dwscoalition.org/show_bug.cgi?id=2257
Recognize the author tag in atom feeds, and dc:creator in RSS feeds. Remove
some redundancy when processing the dc: namespace.
Patch by kareila.
Files modified:
- cgi-bin/LJ/ParseFeed.pm
- cgi-bin/LJ/SynSuck.pm
- t/parsefeed-authors.t
--------------------------------------------------------------------------------
diff -r 543e7334dcbf -r 0e2ad8ec3049 cgi-bin/LJ/ParseFeed.pm
--- a/cgi-bin/LJ/ParseFeed.pm Thu Mar 03 14:15:44 2011 +0800
+++ b/cgi-bin/LJ/ParseFeed.pm Mon Mar 07 15:21:06 2011 +0800
@@ -79,6 +79,11 @@ sub parse_feed
# try parsing it as RSS
$parser = new XML::RSS;
return ("", "failed to create RSS parser") unless $parser;
+
+ # custom LJ/DW namespaces
+ $parser->add_module( prefix => 'nslj',
+ uri => 'http://www.livejournal.org/rss/lj/1.0/' );
+
eval {
$parser->parse($content);
};
@@ -105,7 +110,6 @@ sub parse_feed
$item->{'link'} = $_->{'link'} if $_->{'link'};
$item->{'id'} = $_->{'guid'} if $_->{'guid'};
- my $nsdc = 'http://purl.org/dc/elements/1.1/';
my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {
# prefer content:encoded if present
@@ -113,21 +117,19 @@ sub parse_feed
if defined $_->{$nsenc}->{'encoded'};
}
- my ( $time, $dc_ref );
+ my ( $time, $author );
$time = time822_to_time( $_->{pubDate} ) if $_->{pubDate};
+ $author = $_->{nslj}->{poster}
+ if $_->{nslj} && ref $_->{nslj} eq "HASH";
- if ( $_->{$nsdc} && ref $_->{$nsdc} eq "HASH" ) {
- $dc_ref = $_->{$nsdc};
- } elsif ( $_->{dc} && ref $_->{dc} eq "HASH" ) {
- $dc_ref = $_->{dc};
- }
-
- if ( defined $dc_ref ) {
- $item->{author} = $dc_ref->{creator} if $dc_ref->{creator};
- $time = w3cdtf_to_time( $dc_ref->{date} ) if $dc_ref->{date};
+ # Dublin Core
+ if ( $_->{dc} && ref $_->{dc} eq "HASH" ) {
+ $author = $_->{dc}->{creator} if $_->{dc}->{creator};
+ $time = w3cdtf_to_time( $_->{dc}->{date} ) if $_->{dc}->{date};
}
$item->{time} = $time if $time;
+ $item->{author} = $author if $author;
push @{ $feed->{items} }, $item;
}
@@ -340,6 +342,31 @@ sub StartTag {
last TAGS;
}
+ # we want to store the value of the nested <name> element
+ # in the author slot, not accumulate the raw value -
+ # use temp key "inauth" to detect the nesting
+
+ if ( $tag eq 'author' ) {
+ $holder->{inauth} = 1;
+ last TAGS;
+ }
+
+ if ( $tag eq 'name' ) {
+ if ( $holder->{inauth} ) {
+ startaccum( 'author' );
+ } else {
+ swallow();
+ }
+ last TAGS;
+ }
+
+ if ( $tag eq 'poster' ) {
+ $holder->{ljposter} = $_{user};
+ return err( "No user attribute in <$tag>" )
+ unless $holder->{ljposter};
+ last TAGS;
+ }
+
# store tags which should require no further
# processing as they are, and others under _atom_*, to be processed
# in EndTag under </entry>
@@ -465,12 +492,14 @@ sub EndTag {
$time = "$1-$2-$3 $4:$5";
}
}
- if ($time) {
- $item->{'time'} = $time;
- }
-
+ $item->{time} = $time if $time;
+
+ # if we found ljposter, use that as preferred author
+ $item->{author} = $item->{ljposter} if defined $item->{ljposter};
+ delete $item->{ljposter};
+
# get rid of all other tags we don't need anymore
- foreach (keys %$item) {
+ foreach ( keys %$item ) {
delete $item->{$_} if substr($_, 0, 6) eq '_atom_';
}
@@ -478,10 +507,23 @@ sub EndTag {
undef $item;
last TAGS;
}
+
+ if ( $tag eq 'author' ) {
+ my $holder = $item ? $item : $feed;
+ delete $holder->{inauth};
+ last TAGS;
+ }
+
if ($tag eq 'feed') {
# finalize feed
+
+ # if feed author exists, all items should default to it
+ if ( defined $feed->{author} ) {
+ $_->{author} ||= $feed->{author} foreach @items;
+ }
+
# get rid of all other tags we don't need anymore
- foreach (keys %$feed) {
+ foreach ( keys %$feed ) {
delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';
}
diff -r 543e7334dcbf -r 0e2ad8ec3049 cgi-bin/LJ/SynSuck.pm
--- a/cgi-bin/LJ/SynSuck.pm Thu Mar 03 14:15:44 2011 +0800
+++ b/cgi-bin/LJ/SynSuck.pm Mon Mar 07 15:21:06 2011 +0800
@@ -304,7 +304,7 @@ sub process_content {
foreach my $it (@items) {
# remove the SvUTF8 flag. it's still UTF-8, but
- # we don't want perl knowing that and fucking stuff up
+ # we don't want perl knowing that and messing stuff up
# for us behind our back in random places all over
# http://zilla.livejournal.org/show_bug.cgi?id=1037
foreach my $attr (qw(id subject text link author)) {
diff -r 543e7334dcbf -r 0e2ad8ec3049 t/parsefeed-authors.t
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/t/parsefeed-authors.t Mon Mar 07 15:21:06 2011 +0800
@@ -0,0 +1,156 @@
+# -*-perl-*-
+use strict;
+use Test::More tests => 10;
+
+use lib "$ENV{LJHOME}/cgi-bin";
+require 'ljlib.pl';
+
+use LJ::ParseFeed;
+
+my $feed_rss = q {
+<rss version='2.0'
+ xmlns:lj='http://www.livejournal.org/rss/lj/1.0/'
+ xmlns:dw='http://www.livejournal.org/rss/lj/1.0/'
+ xmlns:atom10='http://www.w3.org/2005/Atom'
+ xmlns:dc='http://purl.org/dc/elements/1.1/'>
+<channel>
+ <title>Title</title>
+ <link>http://examplecomm.dream.fu/</link>
+ <description>Title - Dreamwidth Studios</description>
+ <lastBuildDate>Thu, 03 Feb 2011 17:00:43 GMT</lastBuildDate>
+ <generator>LiveJournal / Dreamwidth Studios</generator>
+ <lj:journal>examplecomm</lj:journal>
+ <lj:journaltype>community</lj:journaltype>
+ <atom10:link rel='self' href='http://examplecomm.dream.fu/data/rss' />
+ <image>
+ <url>http://www.dream.fu/userpic/1/2</url>
+ <title>Title</title>
+ <link>http://examplecomm.dream.fu/</link>
+ <width>100</width>
+ <height>100</height>
+ </image>
+
+<item>
+ <guid isPermaLink='true'>http://examplecomm.dream.fu/12345.html</guid>
+ <pubDate>Thu, 03 Feb 2011 17:00:43 GMT</pubDate>
+ <title>yo</title>
+ <link>http://examplecomm.dream.fu/12345.html</link>
+ <description>yo</description>
+ <comments>http://examplecomm.dream.fu/12345.html</comments>
+ <dc:creator>example-dc-creator</dc:creator>
+</item>
+
+<item>
+ <guid isPermaLink='true'>http://examplecomm.dream.fu/123.html</guid>
+ <pubDate>>Wed, 24 Nov 2010 06:52:33 GMT</pubDate>
+ <title>yo</title>
+ <link>http://examplecomm.dream.fu/123.html</link>
+ <description>yo</description>
+ <comments>http://examplecomm.dream.fu/123.html</comments>
+ <lj:poster>example-lj-poster</lj:poster>
+ <lj:security>public</lj:security>
+ <lj:reply-count>0</lj:reply-count>
+</item>
+
+<item>
+ <guid isPermaLink='true'>http://examplecomm.dream.fu/456.html</guid>
+ <pubDate>>Wed, 24 Jun 2010 06:52:33 GMT</pubDate>
+ <title>yo</title>
+ <link>http://examplecomm.dream.fu/456.html</link>
+ <description>yo</description>
+ <comments>http://examplecomm.dream.fu/456.html</comments>
+ <dw:poster>example-dw-poster</dw:poster>
+ <dw:security>public</dw:security>
+ <dw:reply-count>0</dw:reply-count>
+</item>
+
+</channel>
+</rss>};
+
+my $feed_atom = q {<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dw="http://www.dreamwidth.org" xmlns:lj="http://www.livejournal.com">
+ <title>Feed title</title>
+ <link rel="alternate" type="text/html" href="http://example.com/feed/atom" />
+ <id>example:atom:feed</id>
+ <updated>2011-01-23T17:38:49-08:00</updated>
+ <author>
+ <name>example-feed-author</name>
+ </author>
+
+ <entry>
+ <title>Item 1</title>
+ <link rel="alternate" type="text/html" href="http://example.com/feed/atom/1" />
+ <id>1</id>
+ <published>2011-01-23T13:58:08-08:00</published>
+ <updated>2011-01-23T13:58:08-08:00</updated>
+ <author>
+ <name>example-atom-author</name>
+ </author>
+ <content type="html">foo</content>
+ </entry>
+
+ <entry>
+ <title>Item 2</title>
+ <link rel="alternate" type="text/html" href="http://example.com/feed/atom/2" />
+ <id>2</id>
+ <published>2011-01-23T13:59:55-08:00</published>
+ <updated>2011-01-23T13:59:55-08:00</updated>
+ <dw:poster user="example-dw-poster"/>
+ <content type="html">bar</content>
+ </entry>
+
+ <entry>
+ <title>Item 3</title>
+ <link rel="alternate" type="text/html" href="http://example.com/feed/atom/3" />
+ <id>3</id>
+ <published>2011-01-23T17:38:49-08:00</published>
+ <updated>2011-01-23T17:38:49-08:00</updated>
+ <lj:poster user="example-lj-poster"/>
+ <content type="html">baz</content>
+ </entry>
+
+ <entry>
+ <title>Item 4</title>
+ <link rel="alternate" type="text/html" href="http://example.com/feed/atom/4" />
+ <id>4</id>
+ <published>2011-01-23T18:38:49-08:00</published>
+ <updated>2011-01-23T18:38:49-08:00</updated>
+ <content type="html">quux</content>
+ </entry>
+
+ <entry>
+ <title>Item 5</title>
+ <link rel="alternate" type="text/html" href="http://example.com/feed/atom/5" />
+ <id>5</id>
+ <published>2011-01-23T19:38:49-08:00</published>
+ <updated>2011-01-23T19:38:49-08:00</updated>
+ <lj:poster user="prefer-lj-poster"/>
+ <author>
+ <name>bogus-atom-author</name>
+ </author>
+ <content type="html">blech</content>
+ </entry>
+
+</feed>};
+
+my ( $parse_rss, $rss_error ) = LJ::ParseFeed::parse_feed( $feed_rss, "rss" );
+is( $rss_error, undef, "RSS parse OK" );
+
+SKIP: {
+ skip "RSS parse failed", 3 if $rss_error;
+ is( $parse_rss->{items}->[0]->{author}, "example-dc-creator", "<dc:creator> tag" );
+ is( $parse_rss->{items}->[1]->{author}, "example-lj-poster", "<lj:poster> tag" );
+ is( $parse_rss->{items}->[2]->{author}, "example-dw-poster", "<dw:poster> tag" );
+}
+
+my ( $parse_atom, $atom_error ) = LJ::ParseFeed::parse_feed( $feed_atom, "atom" );
+is( $atom_error, undef, "Atom parse OK" );
+
+SKIP: {
+ skip "Atom parse failed", 5 if $atom_error;
+ is( $parse_atom->{items}->[0]->{author}, "example-atom-author", "item <author> tag" );
+ is( $parse_atom->{items}->[1]->{author}, "example-dw-poster", "<dw:poster> tag" );
+ is( $parse_atom->{items}->[2]->{author}, "example-lj-poster", "<lj:poster> tag" );
+ is( $parse_atom->{items}->[3]->{author}, "example-feed-author", "feed <author> tag" );
+ is( $parse_atom->{items}->[4]->{author}, "prefer-lj-poster", "both <lj:poster> and <author> tags" );
+}
--------------------------------------------------------------------------------
