A lax Web news feed parser
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

397 lines
11 KiB

Minimal feed:
input: >
<feed xmlns="http://www.w3.org/2005/Atom"/>
output:
format: atom
version: '1.0'
Atom ID:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<id>http://example.com/</id>
</feed>
output:
format: atom
version: '1.0'
id: 'http://example.com/'
Atom ID with whitespace:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<id>
http://example.com/
</id>
</feed>
output:
format: atom
version: '1.0'
id: 'http://example.com/'
Bogus ID before good:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<id/>
<id>http://example.com/</id>
</feed>
output:
format: atom
version: '1.0'
id: 'http://example.com/'
Feed language:
input: >
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"/>
output:
format: atom
version: '1.0'
lang: en
Bogus feed language:
input: >
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang=""/>
output:
format: atom
version: '1.0'
Canonical URL:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self"/>
<link rel="self" href="http://example.com/"/>
</feed>
output:
format: atom
version: '1.0'
url: 'http://example.com/'
Feed link 1:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="alternate" href="http://example.com/"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Feed link 2: # an empty relation defaults to "alternate"
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="" href="http://example.com/"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Feed link 3: # a missing relation defaults to "alternate"
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link href="http://example.com/"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Feed link 4: # other relations are ignored
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="bogus" href="http://example.net/"/>
<link href="http://example.com/"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Feed link 5: # XHTML is preferred
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link href="http://example.net/"/>
<link href="http://example.com/" type="application/xhtml+xml; charset=utf-8"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Feed link 6: # HTML is even more preferred
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link href="http://example.net/"/>
<link href="http://example.org/" type="application/xhtml+xml; charset=utf-8"/>
<link href="http://example.com/" type="TEXT/HTML; charset=utf-8"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Feed link 7: # No type is better than an unacceptable type
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link href="http://example.net/" type="image/png"/>
<link href="http://example.org/" type="application/xml"/>
<link href="http://example.com/"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Feed link 8: # Bad URLs are ignored
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link href="http://example.com/" type="application/xhtml+xml; charset=utf-8"/>
<link href="http://[example.org]/" type="text/html; charset=utf-8"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Feed link 9: # The first matching relation wins
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link href="http://example.com/" type="text/html"/>
<link href="http://example.org/" type="text/html"/>
</feed>
output:
format: atom
version: '1.0'
link: 'http://example.com/'
Relative feed link:
doc_url: 'http://example.com/path/'
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<link href="/"/>
</feed>
output:
format: atom
version: '1.0'
meta:
url: 'http://example.com/path/'
link: ['/', 'http://example.com/path/']
Relative feed link with xml:base:
input: >
<feed xmlns="http://www.w3.org/2005/Atom" xml:base="http://example.com/">
<link href="/" xml:base="path/"/>
</feed>
output:
format: atom
version: '1.0'
link: ['/', 'http://example.com/path/']
Feed title as implied plain text 1:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example title</title>
</feed>
output:
format: atom
version: '1.0'
title: 'Example title'
Feed title as implied plain text 2:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title type=" ">Example title</title>
</feed>
output:
format: atom
version: '1.0'
title: 'Example title'
Feed title as explicit plain text 1:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title type="text">Example title</title>
</feed>
output:
format: atom
version: '1.0'
title: 'Example title'
Feed title as explicit plain text 2:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title type="text/plain"> Example title</title>
</feed>
output:
format: atom
version: '1.0'
title: 'Example title'
Feed title as HTML 1:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title type="html"> There &amp;amp; Then</title>
</feed>
output:
format: atom
version: '1.0'
title:
html: 'There &amp; Then'
Feed title as HTML 2:
input: >
<feed xmlns="http://www.w3.org/2005/Atom" xml:base="http://example.com/">
<title type="text/html"> There &amp;amp; Then</title>
</feed>
output:
format: atom
version: '1.0'
title:
html: 'There &amp; Then'
htmlBase: 'http://example.com/'
Feed title as XHTML 1:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml"> There &amp; Then </div>
</title>
</feed>
output:
format: atom
version: '1.0'
title:
xhtml: '<div xmlns="http://www.w3.org/1999/xhtml"> There &amp; Then </div>'
Feed title as XHTML 2:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title type="application/xhtml+xml">
<div xmlns="http://www.w3.org/1999/xhtml" xml:base="http://example.com/"> There &amp; Then </div>
</title>
</feed>
output:
format: atom
version: '1.0'
title:
xhtml: '<div xmlns="http://www.w3.org/1999/xhtml" xml:base="http://example.com/"> There &amp; Then </div>'
xhtmlBase: 'http://example.com/'
Feed title as XHTML 3:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title type="application/xhtml+xml">
<div xmlns="http://www.w3.org/1999/xhtml" xml:base="http://example.com/"/>
</title>
</feed>
output:
format: atom
version: '1.0'
title:
xhtml: '<div xmlns="http://www.w3.org/1999/xhtml" xml:base="http://example.com/"/>'
xhtmlBase: 'http://example.com/'
Feed title in multiple formats:
input: >
<feed xmlns="http://www.w3.org/2005/Atom" xml:base="http://example.com/">
<title type="text/plain"> Example title</title>
<title type="html"> There &amp;amp; Then</title>
<title type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml"> There &amp; Then </div>
</title>
</feed>
output:
format: atom
version: '1.0'
title:
plain: 'Example title'
html: 'There &amp; Then'
htmlBase: 'http://example.com/'
xhtml: '<div xmlns="http://www.w3.org/1999/xhtml"> There &amp; Then </div>'
xhtmlBase: 'http://example.com/'
Feed summary 1:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<subtitle type="html">Subtitle text</subtitle>
<summary type="html">Summary text</summary>
</feed>
output:
format: atom
version: '1.0'
summary:
html: 'Summary text'
Feed summary 2:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<subtitle type="html">Subtitle text</subtitle>
</feed>
output:
format: atom
version: '1.0'
summary:
html: 'Subtitle text'
Ignored text constructs:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<title type="text/plain"/>
<title type="text/html"/>
<title type="text/plain"> </title>
<title type="text/html"> </title>
<title type="xhtml"/>
<title type="xhtml"><div/></title>
<title type="xhtml"><body xmlns="http://www.w3.org/1999/xhtml"/></title>
<title type="xhtml"><div>Bogus</div></title>
<title type="xhtml"><body xmlns="http://www.w3.org/1999/xhtml">Bogus</body></title>
<title type="application/octet-stream">RXhhbXBsZSB0aXRsZQo=</title>
<title type="/">RXhhbXBsZSB0aXRsZQo=</title>
<title src="http://example.com/"/>
<title src=""/>
</feed>
output:
format: atom
version: '1.0'
Feed date:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-03T00:00:00Z'
Multiple feed dates:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
<updated>2020-03-04T00:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-04T00:00:00Z'
Multiple feed date timezones:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
<updated>2020-03-03T00:00:00-04:00</updated>
<updated>2020-03-03T01:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-03T00:00:00-04:00'
Bogus feed date:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<created>2020-03-03T00:00:00Z</created>
</feed>
output:
format: atom
version: '1.0'