Custom Extractor for clinicaltrials.gov (#305)

* Add prototype of custom extractor for clinicaltrials.gov

* Add .DS_Store to gitignore

* Make tests for title, author and date_published selectors pass

* Make content selector test pass

* Fix date_published test

* Rebuild

* Remove .DS_Store from gitignore

* Improve extractor and test/fixture of clinicaltrials.gov
pull/430/head
david0leong 5 years ago committed by Toufic Mouallem
parent a7cd9027e2
commit 694ea820aa

File diff suppressed because it is too large Load Diff

@ -0,0 +1,29 @@
/**
 * Custom extractor configuration for the clinicaltrials.gov domain.
 *
 * Each field lists the CSS selectors used to locate that piece of the page.
 */
export const ClinicaltrialsGovExtractor = {
  domain: 'clinicaltrials.gov',

  title: {
    selectors: ['h1.tr-solo_record'],
  },

  // The sponsor element is what gets reported as the author.
  author: {
    selectors: ['div#sponsor.tr-info-text'],
  },

  date_published: {
    // Match the div that directly contains the "Last Update Posted" label
    // span, rather than the label span itself.
    selectors: ['div:has(> span.term[data-term="Last Update Posted"])'],
  },

  content: {
    selectors: ['div#tab-body'],

    // Hook for reshaping selected content before it is consumable
    // (e.g., unusual lazy-loaded images). Nothing is transformed here.
    transforms: {},

    // Anything matching these selectors is removed from the result.
    clean: ['.usa-alert> img'],
  },
};

@ -0,0 +1,91 @@
import assert from 'assert';
import URL from 'url';
import cheerio from 'cheerio';
import moment from 'moment-timezone';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
import { excerptContent } from 'utils/text';
const fs = require('fs');
// Runs the clinicaltrials.gov custom extractor against a saved HTML fixture
// and checks each extracted field.
describe('ClinicaltrialsGovExtractor', () => {
  describe('initial test case', () => {
    const fixturePath = './fixtures/clinicaltrials.gov/1551708504719.html';

    // Both are assigned once in beforeAll and shared by every test below.
    let pending;
    let url;

    beforeAll(() => {
      url =
        'https://clinicaltrials.gov/ct2/show/NCT03746262?term=Guardant+Health';
      const html = fs.readFileSync(fixturePath);

      // Not awaited here; each test awaits the shared value itself.
      pending = Mercury.parse(url, { html, fallback: false });
    });

    it('is selected properly', () => {
      // Sanity check: the extractor chosen for this URL must be the one
      // registered for the URL's hostname.
      const { hostname } = URL.parse(url);
      assert.equal(getExtractor(url).domain, hostname);
    });

    it('returns the title', async () => {
      // Driven by the `title` selector in
      // ./src/extractors/custom/clinicaltrials.gov/index.js.
      const parsed = await pending;

      assert.equal(
        parsed.title,
        `Changes in Circulating Tumor-Specific DNA in Patients With Non-Metastatic Non-Small Cell Lung Cancer`
      );
    });

    it('returns the author', async () => {
      // Driven by the `author` selector in
      // ./src/extractors/custom/clinicaltrials.gov/index.js.
      const parsed = await pending;

      assert.equal(parsed.author, 'Wake Forest University Health Sciences');
    });

    it('returns the date_published', async () => {
      // Driven by the `date_published` selector in
      // ./src/extractors/custom/clinicaltrials.gov/index.js.
      const parsed = await pending;
      const formatted = moment(parsed.date_published).format('YYYY-MM-DD');

      assert.equal(formatted, '2018-11-21');
    });

    it('returns the content', async () => {
      // Driven by the `content` selectors (plus clean/transforms) in
      // ./src/extractors/custom/clinicaltrials.gov/index.js.
      const parsed = await pending;
      const $ = cheerio.load(parsed.content || '');

      // Text of the first matched element, excerpted via excerptContent(…, 13).
      const leadingText = $('*')
        .first()
        .text();
      const excerpt = excerptContent(leadingText, 13);

      assert.equal(
        excerpt,
        'The purpose of this research study is to evaluate a blood test to'
      );
    });
  });
});

@ -93,6 +93,7 @@ export * from './www.fortinet.com';
export * from './www.fastcompany.com';
export * from './blisterreview.com';
export * from './news.mynavi.jp';
export * from './clinicaltrials.gov';
export * from './github.com';
export * from './www.reddit.com';
export * from './otrs.com';

@ -109,10 +109,9 @@ describe('Resource', () => {
() => {
Resource.generateDoc({ body, response });
},
err => (
(err instanceof Error) &&
err =>
err instanceof Error &&
/content does not appear to be text/i.test(err)
)
);
});

Loading…
Cancel
Save