You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
mercury-parser/src/extractors/generic/word-count/extractor.js

28 lines
689 B
JavaScript

import cheerio from 'cheerio';
import { normalizeSpaces } from 'utils/text';
/**
 * Counts the whitespace-separated tokens in the text of the first `<div>`
 * found in an HTML string.
 *
 * The text is passed through `normalizeSpaces` before being split on single
 * whitespace characters. Splitting always yields at least one piece, so the
 * result is 1 (not 0) when no `<div>` exists or its text is empty —
 * assuming `normalizeSpaces` returns a string; confirm against utils/text.
 *
 * @param {string} content - HTML markup to measure.
 * @returns {number} Number of pieces produced by the split.
 */
const getWordCount = content => {
  const $ = cheerio.load(content);
  const firstDivText = $('div')
    .first()
    .text();
  const pieces = normalizeSpaces(firstDivText).split(/\s/);
  return pieces.length;
};
/**
 * Counts words in an HTML string without building a DOM.
 *
 * Every `<...>` tag is replaced with a space, runs of whitespace are
 * collapsed to a single space, the result is trimmed, and the remaining
 * text is split on single spaces.
 *
 * @param {string} content - HTML markup (or plain text) to measure.
 * @returns {number} Number of words; 0 when nothing but tags and
 *   whitespace remains.
 */
const getWordCountAlt = content => {
  const text = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  // ''.split(' ') yields [''], which would report one word for empty input.
  return text === '' ? 0 : text.split(' ').length;
};
/**
 * Extractor that reports a word count for a piece of HTML content.
 */
const GenericWordCountExtractor = {
  /**
   * Computes the word count of `content`.
   *
   * The count from `getWordCount` is used unless it is exactly 1, in which
   * case the result of `getWordCountAlt` is returned instead.
   *
   * @param {{ content: string }} options - Object carrying the HTML string.
   * @returns {number} The word count.
   */
  extract({ content }) {
    const primaryCount = getWordCount(content);
    if (primaryCount !== 1) {
      return primaryCount;
    }
    return getWordCountAlt(content);
  },
};
export default GenericWordCountExtractor;