315 lines
10 KiB
JavaScript
315 lines
10 KiB
JavaScript
|
"use strict";
|
||
|
const http = require('http');
|
||
|
const https = require('https');
|
||
|
const xml2js = require('xml2js');
|
||
|
const url = require('url');
|
||
|
|
||
|
const fields = require('./fields');
|
||
|
const utils = require('./utils');
|
||
|
|
||
|
// Upper bound, in milliseconds, on how long a single feed request may take.
const DEFAULT_TIMEOUT = 60000;

// Number of HTTP redirects followed before a request is abandoned.
const DEFAULT_MAX_REDIRECTS = 5;

// Request headers sent with every feed fetch; Parser#parseURL merges the
// caller's `options.headers` over these.
const DEFAULT_HEADERS = {
  'User-Agent': 'rss-parser',
  'Accept': 'application/rss+xml',
};
|
||
|
|
||
|
/**
 * Extract the text of the first Atom `<title>` node.
 *
 * @param {Array} titleNodes the `title` array of a parsed feed or entry
 * @returns {*} the node's `_` value when it has a truthy one, otherwise the
 *   node itself, or '' when the first node is falsy
 */
function firstTitleText(titleNodes) {
  const first = titleNodes[0] || '';
  return first._ ? first._ : first;
}

/**
 * Convert a date string to ISO-8601 without throwing on bad input.
 *
 * @param {string} value raw date text taken from the feed
 * @returns {string|null} ISO string, or null when the text is not a valid date
 */
function toISOStringOrNull(value) {
  const parsed = new Date(value);
  // Date#toISOString throws a RangeError for an invalid date, so test first.
  return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
}

/**
 * Read the `text` attribute of a parsed XML node, if the node has one.
 *
 * @param {*} node a parsed XML node; may be a plain string without attributes
 * @returns {*} the attribute value, or undefined when the node has no attributes
 */
function textAttribute(node) {
  return (node && node.$) ? node.$.text : undefined;
}

/**
 * Extract one keyword from an `itunes:keywords` node.
 *
 * @param {*} keyword a parsed node: plain string, or object with `$` / `_`
 * @returns {string|undefined} the keyword text, if any could be found
 */
function keywordText(keyword) {
  if (typeof keyword === 'string') return keyword;
  if (keyword && keyword.$ && keyword.$.text !== undefined) return keyword.$.text;
  if (keyword && typeof keyword._ === 'string') return keyword._;
  return undefined;
}

/**
 * Parses RSS 0.9x / 1.0 / 2.0 and Atom feeds, from a string or a URL, into a
 * plain object of the form `{items: [...], ...feed-level fields}`.
 */
class Parser {
  /**
   * @param {object} [options]
   * @param {object} [options.headers] extra request headers for parseURL
   * @param {object} [options.xml2js] options handed to `xml2js.Parser`
   * @param {object} [options.customFields] `{feed: [], item: []}` additional
   *   fields passed to `utils.copyFromXML`
   * @param {object} [options.requestOptions] merged last into the options
   *   given to `http.get` / `https.get`
   * @param {number} [options.maxRedirects] redirects to follow; 0 disables
   *   redirect following, unset falls back to DEFAULT_MAX_REDIRECTS
   * @param {number} [options.timeout] request timeout in ms; a falsy value
   *   falls back to DEFAULT_TIMEOUT
   * @param {number} [options.defaultRSS] RSS version (0.9, 1 or 2) assumed
   *   when an `<rss>` document's version is not recognized
   */
  constructor(options={}) {
    // Work on copies so the caller's object is never modified.
    const opts = Object.assign({}, options);
    const customFields = Object.assign({}, opts.customFields);
    customFields.item = customFields.item || [];
    customFields.feed = customFields.feed || [];
    opts.customFields = customFields;
    opts.headers = opts.headers || {};
    opts.xml2js = opts.xml2js || {};
    opts.requestOptions = opts.requestOptions || {};
    // Only default when unset: an explicit 0 means "do not follow redirects".
    if (opts.maxRedirects === undefined || opts.maxRedirects === null) {
      opts.maxRedirects = DEFAULT_MAX_REDIRECTS;
    }
    if (!opts.timeout) opts.timeout = DEFAULT_TIMEOUT;
    this.options = opts;
    this.xmlParser = new xml2js.Parser(this.options.xml2js);
  }

  /**
   * Parse feed XML that is already in memory.
   *
   * @param {string} xml feed document
   * @param {function} [callback] passed to `utils.maybePromisify` together
   *   with the promise (presumably a node-style callback; see ./utils)
   * @returns {*} whatever `utils.maybePromisify` returns for the promise,
   *   which resolves with the feed object or rejects with an Error
   */
  parseString(xml, callback) {
    let prom = new Promise((resolve, reject) => {
      this.xmlParser.parseString(xml, (err, result) => {
        if (err) return reject(err);
        if (!result) {
          return reject(new Error('Unable to parse XML.'));
        }
        let feed = null;
        // The builders can throw on parseable-but-malformed XML (for example
        // an <rss> element without <channel>); turn that into a rejection
        // regardless of how the XML parser invokes this callback.
        try {
          const version = result.rss && result.rss.$ && result.rss.$.version;
          if (result.feed) {
            feed = this.buildAtomFeed(result);
          } else if (version && version.match(/^2/)) {
            feed = this.buildRSS2(result);
          } else if (result['rdf:RDF']) {
            feed = this.buildRSS1(result);
          } else if (version && version.match(/0\.9/)) {
            feed = this.buildRSS0_9(result);
          } else if (result.rss && this.options.defaultRSS) {
            switch (this.options.defaultRSS) {
              case 0.9:
                feed = this.buildRSS0_9(result);
                break;
              case 1:
                feed = this.buildRSS1(result);
                break;
              case 2:
                feed = this.buildRSS2(result);
                break;
              default:
                throw new Error("default RSS version not recognized.");
            }
          } else {
            throw new Error("Feed not recognized as RSS 1 or 2.");
          }
        } catch (e) {
          return reject(e);
        }
        resolve(feed);
      });
    });
    prom = utils.maybePromisify(callback, prom);
    return prom;
  }

  /**
   * Fetch a feed over HTTP(S), following redirects, and parse it.
   *
   * @param {string} feedUrl absolute URL of the feed
   * @param {function} [callback] passed to `utils.maybePromisify` together
   *   with the promise (presumably a node-style callback; see ./utils)
   * @param {number} [redirectCount=0] redirects followed so far; used by the
   *   recursive redirect handling, callers normally omit it
   * @returns {*} whatever `utils.maybePromisify` returns for the promise,
   *   which resolves with the feed object or rejects on network error,
   *   timeout, status >= 300, too many redirects, or a parse failure
   */
  parseURL(feedUrl, callback, redirectCount=0) {
    const urlParts = url.parse(feedUrl);
    // Choose the client from the parsed protocol rather than a string prefix.
    const get = urlParts.protocol === 'https:' ? https.get : http.get;
    const headers = Object.assign({}, DEFAULT_HEADERS, this.options.headers);
    let timer = null;
    let prom = new Promise((resolve, reject) => {
      const requestOpts = Object.assign({headers}, urlParts, this.options.requestOptions);
      const req = get(requestOpts, (res) => {
        const status = res.statusCode;
        const location = res.headers['location'];
        if (this.options.maxRedirects && status >= 300 && status < 400 && location) {
          // The body of a redirect is not needed; drain it to free the socket.
          res.resume();
          if (redirectCount === this.options.maxRedirects) {
            return reject(new Error("Too many redirects"));
          }
          const newLocation = url.resolve(feedUrl, location);
          return this.parseURL(newLocation, null, redirectCount + 1).then(resolve, reject);
        } else if (status >= 300) {
          res.resume();
          return reject(new Error("Status code " + status));
        }
        let xml = '';
        const encoding = utils.getEncodingFromContentType(res.headers['content-type']);
        res.setEncoding(encoding);
        res.on('error', reject);
        res.on('data', (chunk) => {
          xml += chunk;
        });
        res.on('end', () => {
          return this.parseString(xml).then(resolve, reject);
        });
      });
      req.on('error', reject);
      timer = setTimeout(() => {
        // Reject first so the timeout message wins over the error that the
        // teardown below may emit, then release the underlying socket.
        reject(new Error("Request timed out after " + this.options.timeout + "ms"));
        req.destroy();
      }, this.options.timeout);
    }).then(data => {
      clearTimeout(timer);
      return data;
    }, e => {
      clearTimeout(timer);
      throw e;
    });
    prom = utils.maybePromisify(callback, prom);
    return prom;
  }

  /**
   * Build a feed object from a parsed Atom document.
   *
   * @param {object} xmlObj parsed XML whose root is `feed`
   * @returns {object} feed with `items` and, when present in the document,
   *   `link`, `feedUrl`, `title` and `lastBuildDate`
   */
  buildAtomFeed(xmlObj) {
    const source = xmlObj.feed;
    let feed = {items: []};
    utils.copyFromXML(source, feed, this.options.customFields.feed);
    if (source.link) {
      feed.link = utils.getLink(source.link, 'alternate', 0);
      feed.feedUrl = utils.getLink(source.link, 'self', 1);
    }
    if (source.title) {
      const title = firstTitleText(source.title);
      if (title) feed.title = title;
    }
    if (source.updated) {
      feed.lastBuildDate = source.updated[0];
    }
    feed.items = (source.entry || []).map(entry => this.parseItemAtom(entry));
    return feed;
  }

  /**
   * Convert one parsed Atom `<entry>` into an item object.
   *
   * @param {object} entry parsed entry node
   * @returns {object} item carrying whichever of `title`, `link`, `pubDate`,
   *   `author`, `content`, `contentSnippet`, `id` and `isoDate` could be read
   */
  parseItemAtom(entry) {
    let item = {};
    utils.copyFromXML(entry, item, this.options.customFields.item);
    if (entry.title) {
      const title = firstTitleText(entry.title);
      if (title) item.title = title;
    }
    if (entry.link && entry.link.length) {
      item.link = utils.getLink(entry.link, 'alternate', 0);
    }
    // Prefer <published>; fall back to <updated>. An unparseable date is
    // skipped instead of throwing and aborting the whole feed.
    if (entry.published && entry.published.length && entry.published[0].length) {
      const published = toISOStringOrNull(entry.published[0]);
      if (published) item.pubDate = published;
    }
    if (!item.pubDate && entry.updated && entry.updated.length && entry.updated[0].length) {
      const updated = toISOStringOrNull(entry.updated[0]);
      if (updated) item.pubDate = updated;
    }
    if (entry.author && entry.author.length && entry.author[0].name && entry.author[0].name.length) {
      item.author = entry.author[0].name[0];
    }
    if (entry.content && entry.content.length) {
      item.content = utils.getContent(entry.content[0]);
      item.contentSnippet = utils.getSnippet(item.content);
    }
    if (entry.id) {
      item.id = entry.id[0];
    }
    this.setISODate(item);
    return item;
  }

  /**
   * Build a feed from a parsed RSS 0.9x document (`rss > channel > item`).
   *
   * @param {object} xmlObj parsed XML whose root is `rss`
   * @returns {object} feed object
   */
  buildRSS0_9(xmlObj) {
    const channel = xmlObj.rss.channel[0];
    const items = channel.item;
    return this.buildRSS(channel, items);
  }

  /**
   * Build a feed from a parsed RSS 1.0 document, where the items are
   * siblings of `channel` under `rdf:RDF`.
   *
   * @param {object} xmlObj parsed XML whose root is `rdf:RDF`
   * @returns {object} feed object
   */
  buildRSS1(xmlObj) {
    xmlObj = xmlObj['rdf:RDF'];
    const channel = xmlObj.channel[0];
    const items = xmlObj.item;
    return this.buildRSS(channel, items);
  }

  /**
   * Build a feed from a parsed RSS 2.0 document, adding iTunes fields when
   * the root element declares the `xmlns:itunes` namespace.
   *
   * @param {object} xmlObj parsed XML whose root is `rss`
   * @returns {object} feed object
   */
  buildRSS2(xmlObj) {
    const channel = xmlObj.rss.channel[0];
    const items = channel.item;
    const feed = this.buildRSS(channel, items);
    if (xmlObj.rss.$ && xmlObj.rss.$['xmlns:itunes']) {
      this.decorateItunes(feed, channel);
    }
    return feed;
  }

  /**
   * Shared RSS builder used by all RSS versions.
   *
   * @param {object} channel parsed `<channel>` node
   * @param {Array} [items] parsed `<item>` nodes; treated as empty when falsy
   * @returns {object} feed with `items` and, when present, `feedUrl`, `image`
   *   and the fields copied by `utils.copyFromXML`
   */
  buildRSS(channel, items) {
    items = items || [];
    let feed = {items: []};
    const feedFields = fields.feed.concat(this.options.customFields.feed);
    const itemFields = fields.item.concat(this.options.customFields.item);
    const atomLink = channel['atom:link'];
    if (atomLink && atomLink[0] && atomLink[0].$) {
      feed.feedUrl = atomLink[0].$.href;
    }
    if (channel.image && channel.image[0] && channel.image[0].url) {
      const image = channel.image[0];
      feed.image = {};
      if (image.link) feed.image.link = image.link[0];
      if (image.url) feed.image.url = image.url[0];
      if (image.title) feed.image.title = image.title[0];
      if (image.width) feed.image.width = image.width[0];
      if (image.height) feed.image.height = image.height[0];
    }
    utils.copyFromXML(channel, feed, feedFields);
    feed.items = items.map(xmlItem => this.parseItemRss(xmlItem, itemFields));
    return feed;
  }

  /**
   * Convert one parsed RSS `<item>` into an item object.
   *
   * @param {object} xmlItem parsed item node
   * @param {Array} itemFields field list handed to `utils.copyFromXML`
   * @returns {object} item carrying whichever of `enclosure`, `content`,
   *   `contentSnippet`, `guid`, `categories` and `isoDate` could be read
   */
  parseItemRss(xmlItem, itemFields) {
    let item = {};
    utils.copyFromXML(xmlItem, item, itemFields);
    if (xmlItem.enclosure) {
      item.enclosure = xmlItem.enclosure[0].$;
    }
    if (xmlItem.description) {
      item.content = utils.getContent(xmlItem.description[0]);
      item.contentSnippet = utils.getSnippet(item.content);
    }
    if (xmlItem.guid) {
      item.guid = xmlItem.guid[0];
      // A guid with attributes is parsed into an object; its text is in `_`.
      if (item.guid._) item.guid = item.guid._;
    }
    if (xmlItem.category) item.categories = xmlItem.category;
    this.setISODate(item);
    return item;
  }

  /**
   * Add iTunes specific fields from XML to extracted JSON
   *
   * Sets `feed.itunes` (owner, image, categories, categoriesWithSubs,
   * keywords plus the `fields.podcastFeed` fields) and an `itunes` object on
   * every entry of `feed.items`. Nodes without attributes are tolerated.
   *
   * @access public
   * @param {object} feed extracted
   * @param {object} channel parsed XML
   */
  decorateItunes(feed, channel) {
    const items = channel.item || [];
    feed.itunes = {};

    if (channel['itunes:owner']) {
      const ownerNode = channel['itunes:owner'][0];
      let owner = {};
      if (ownerNode['itunes:name']) {
        owner.name = ownerNode['itunes:name'][0];
      }
      if (ownerNode['itunes:email']) {
        owner.email = ownerNode['itunes:email'][0];
      }
      feed.itunes.owner = owner;
    }

    if (channel['itunes:image']) {
      const imageNode = channel['itunes:image'][0];
      const href = imageNode && imageNode.$ && imageNode.$.href;
      if (href) {
        feed.itunes.image = href;
      }
    }

    if (channel['itunes:category']) {
      const categoriesWithSubs = channel['itunes:category'].map((category) => {
        const subNodes = category && category['itunes:category'];
        return {
          // A category without attributes is parsed as a plain string and
          // has no `$`; its name is then undefined rather than a TypeError.
          name: textAttribute(category),
          subs: subNodes ?
            subNodes.map((subcategory) => ({ name: textAttribute(subcategory) })) : null,
        };
      });

      feed.itunes.categories = categoriesWithSubs.map((category) => category.name);
      feed.itunes.categoriesWithSubs = categoriesWithSubs;
    }

    if (channel['itunes:keywords']) {
      if (channel['itunes:keywords'].length > 1) {
        // Several <itunes:keywords> elements: one keyword per element.
        feed.itunes.keywords = channel['itunes:keywords'].map(keywordText);
      } else {
        // A single element: a comma separated list held either in the text
        // content or in a `text` attribute.
        let keywords = channel['itunes:keywords'][0];
        if (keywords && typeof keywords._ === 'string') {
          keywords = keywords._;
        }

        if (keywords && keywords.$ && keywords.$.text) {
          feed.itunes.keywords = keywords.$.text.split(',');
        } else if (typeof keywords === "string") {
          feed.itunes.keywords = keywords.split(',');
        }
      }
    }

    utils.copyFromXML(channel, feed.itunes, fields.podcastFeed);
    // feed.items was built from channel.item, so the indices line up.
    items.forEach((item, index) => {
      const entry = feed.items[index];
      entry.itunes = {};
      utils.copyFromXML(item, entry.itunes, fields.podcastItem);
      const image = item['itunes:image'];
      if (image && image[0] && image[0].$ && image[0].$.href) {
        entry.itunes.image = image[0].$.href;
      }
    });
  }

  /**
   * Set `item.isoDate` from `item.pubDate` (or `item.date`) when that value
   * can be parsed as a date; otherwise leave the item untouched.
   *
   * @param {object} item item being built; modified in place
   */
  setISODate(item) {
    const date = item.pubDate || item.date;
    if (date) {
      try {
        item.isoDate = new Date(date.trim()).toISOString();
      } catch (e) {
        // Ignore bad date format
      }
    }
  }
}
|
||
|
|
||
|
// Export the Parser class as the module's value.
module.exports = Parser;
|