You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
59 lines
1.8 KiB
JavaScript
59 lines
1.8 KiB
JavaScript
// CLEAN AUTHOR CONSTANTS
|
|
/**
 * Matches an author byline prefix at the start of a string: optional
 * whitespace, an optional "posted " or "written ", the word "by", and an
 * optional colon. Case-insensitive.
 *
 * Capture group 1 is the optional "posted "/"written " prefix; capture
 * group 2 is everything that follows the prefix on the same line.
 */
export const CLEAN_AUTHOR_RE =
  /^\s*(posted |written )?by\s*:?\s*(.*)/i;
|
|
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
|
|
|
|
// CLEAN DEK CONSTANTS
|
|
/**
 * Case-insensitive check for an "http://" or "https://" scheme anywhere in
 * a string. No `g` flag, so `.test()` on this instance is stateless.
 */
export const TEXT_LINK_RE =
  new RegExp('http(s)?://', 'i');
|
|
// An ordered list of meta tag names that denote likely article deks.
// From most distinct to least distinct.
//
// NOTE: There are currently no meta tags that seem to provide the right
// content consistently enough. Two options were:
//  - og:description
//  - dc.description
// However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best.
//
// The list is therefore intentionally empty.
export const DEK_META_TAGS = [];
|
|
|
|
// An ordered list of selectors to find likely article deks. From
// most explicit to least explicit.
//
// Should be more restrictive than not, as a failed dek can be pretty
// detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [
  '.entry-summary',
];
|
|
|
|
// CLEAN DATE PUBLISHED CONSTANTS
|
|
/**
 * Matches a leading "published" label (optional surrounding whitespace and
 * an optional colon), case-insensitively. Capture group 1 is the text that
 * follows the label.
 */
export const CLEAN_DATE_STRING_RE =
  /^\s*published\s*:?\s*(.*)/i;
|
|
/**
 * Matches a string in which "am" or "pm" (any case) directly follows a
 * digit, e.g. "12:30pm".
 *
 * Groups: 1 = everything up to and including that digit, 2 = the meridian,
 * 3 = the remainder of the string. Presumably used to insert a space
 * between the time and the meridian — confirm against the caller.
 */
export const TIME_MERIDIAN_SPACE_RE =
  /(.*\d)(am|pm)(.*)/i;
|
|
/**
 * Matches the literal sequence ".m." (case-insensitive), i.e. the dotted
 * tail of "a.m." / "p.m.". Without the `g` flag only the first occurrence
 * is affected by `String.prototype.replace`.
 */
export const TIME_MERIDIAN_DOTS_RE =
  /\.m\./i;
|
|
// Three-letter month abbreviations, January through December.
const months = [
  'jan',
  'feb',
  'mar',
  'apr',
  'may',
  'jun',
  'jul',
  'aug',
  'sep',
  'oct',
  'nov',
  'dec',
];

// Regex alternation fragment built from the abbreviations: "jan|feb|...|dec".
const allMonths = months.join('|');

// Clock time such as "8:30", optionally followed by a meridian ("8:30 pm").
// NOTE(review): the dots in `[ap].?m.?` are unescaped, so each accepts any
// single character rather than only a literal "." — confirm this is intended.
const timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';

// Numeric date with "/" or "-" separators, e.g. "12/25/2015" or "1-2-15".
const timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';

/**
 * Global, case-insensitive regex that picks date-like tokens out of a
 * string. Alternatives are tried in this order at each position:
 *   1. a clock time (`timestamp1`),
 *   2. a separator-delimited numeric date (`timestamp2`),
 *   3. a bare run of 1-4 digits,
 *   4. a three-letter month abbreviation.
 *
 * NOTE: carries the `g` flag, so `.test()`/`.exec()` on this shared
 * instance advance `lastIndex` between calls.
 */
export const SPLIT_DATE_STRING =
  new RegExp(`(${timestamp1})|(${timestamp2})|([0-9]{1,4})|(${allMonths})`, 'ig');
|
|
|
|
// CLEAN TITLE CONSTANTS
|
|
// A regular expression that will match separating characters on a
|
|
// title, that usually denote breadcrumbs or something similar.
|
|
/**
 * Global regex matching one of three separators: ": ", " - " or " | ".
 * The separator sits in a capture group, so `String.prototype.split` keeps
 * each matched separator as its own element in the resulting array.
 */
export const TITLE_SPLITTERS_RE =
  /(: | - | \| )/g;
|
|
|
|
/**
 * Matches a trailing ".com", ".net", ".org" or ".co.uk" at the end of a
 * string.
 *
 * Written as a regex literal on purpose: inside a quoted string handed to
 * `new RegExp`, `'\.'` collapses to a bare `.` (any character), which made
 * the pattern also match endings such as "telecom" or "internet".
 *
 * NOTE: carries the `g` flag, so `.test()`/`.exec()` on this shared
 * instance advance `lastIndex` between calls.
 */
export const DOMAIN_ENDINGS_RE = /\.com$|\.net$|\.org$|\.co\.uk$/g;
|