Skip to content

Commit

Permalink
DOI: Pull DOIs from URL (zotero#2967)
Browse files Browse the repository at this point in the history
  • Loading branch information
AbeJellinek authored Mar 10, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent f3b8dfd commit 48a038d
Showing 1 changed file with 139 additions and 33 deletions.
172 changes: 139 additions & 33 deletions DOI.js
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2020-03-13 02:38:54"
"lastUpdated": "2023-01-25 15:27:33"
}

/*
@@ -35,23 +35,58 @@
***** END LICENSE BLOCK *****
*/

// builds a list of DOIs
function getDOIs(doc) {
// TODO Detect DOIs more correctly.
// The actual rules for DOIs are very lax-- but we're more strict.
// Specifically, we should allow space characters, and all Unicode
// characters except for control characters. Here, we're cheating
// by not allowing ampersands, to fix an issue with getting DOIs
// out of URLs.
// Additionally, all content inside <noscript> is picked up as text()
// by the xpath, which we don't necessarily want to exclude, but
// that means that we can get DOIs inside node attributes and we should
// exclude quotes in this case.
// DOI should never end with a period or a comma (we hope)
// Description at: http://www.doi.org/handbook_2000/appendix_1.html#A1-4
const DOIre = /\b10\.[0-9]{4,}\/[^\s&"']*[^\s&"'.,]/g;

var dois = [];
// TODO Detect DOIs more correctly.
// The actual rules for DOIs are very lax-- but we're more strict.
// Specifically, we should allow space characters, and all Unicode
// characters except for control characters. Here, we're cheating
// by not allowing ampersands, to fix an issue with getting DOIs
// out of URLs.
// Additionally, all content inside <noscript> is picked up as text()
// by the xpath, which we don't necessarily want to exclude, but
// that means that we can get DOIs inside node attributes and we should
// exclude quotes in this case.
// DOI should never end with a period or a comma (we hope)
// Description at: http://www.doi.org/handbook_2000/appendix_1.html#A1-4
const DOIre = /\b10\.[0-9]{4,}\/[^\s&"']*[^\s&"'.,]/g;

/**
* @return {string | string[]} A single string if the URL contains a DOI
* and the document contains no others, or an array of DOIs otherwise
*/
function getDOIs(doc, url) {
let fromURL = getDOIFromURL(url);
let fromDocument = getDOIsFromDocument(doc);
if (
// We got a DOI from the URL
fromURL && (
// And none from the document
fromDocument.length == 0
// Or one from the document, but the same one that was in the URL
|| fromDocument.length == 1 && fromDocument[0] == fromURL
)
) {
return fromURL;
}
// De-duplicate before returning
return Array.from(new Set(fromURL ? [fromURL, ...fromDocument] : fromDocument));
}

function getDOIFromURL(url) {
// Split on # and ?, so that we don't allow DOIs to contain those characters
// but do allow finding DOIs on either side of them (e.g. a DOI in the URL hash)
let urlParts = url.split(/[#?]/);
for (let urlPart of urlParts) {
let match = DOIre.exec(urlPart);
if (match) {
// Only return a single DOI from the URL
return match[0];
}
}
return null;
}

function getDOIsFromDocument(doc) {
var dois = new Set();

var m, DOI;
var treeWalker = doc.createTreeWalker(doc.documentElement, 4, null, false);
@@ -69,8 +104,8 @@ function getDOIs(doc) {
DOI = DOI.substr(0, DOI.length - 1);
}
// only add new DOIs
if (!dois.includes(DOI)) {
dois.push(DOI);
if (!dois.has(DOI)) {
dois.add(DOI);
}
}
}
@@ -90,13 +125,13 @@ function getDOIs(doc) {
doi = doi.substr(0, doi.length - 1);
}
// only add new DOIs
if (!dois.includes(doi)) {
dois.push(doi);
if (!dois.has(doi) && !dois.has(doi.replace(/#.*/, ''))) {
dois.add(doi);
}
}
}

return dois;
return Array.from(dois);
}

function detectWeb(doc, url) {
@@ -106,15 +141,20 @@ function detectWeb(doc, url) {
const blacklistRe = /^https?:\/\/[^/]*(?:google\.com|sciencedirect\.com\/science\/advertisement\/)/i;

if (!blacklistRe.test(url)) {
var DOIs = getDOIs(doc);
if (DOIs.length) {
let doiOrDOIs = getDOIs(doc, url);
if (Array.isArray(doiOrDOIs) && doiOrDOIs.length) {
return "multiple";
}
else {
return "journalArticle"; // A decent guess
}
}
return false;
}

function retrieveDOIs(dois) {
function retrieveDOIs(doiOrDOIs) {
let showSelect = Array.isArray(doiOrDOIs);
let dois = showSelect ? doiOrDOIs : [doiOrDOIs];
let items = {};
let numDOIs = dois.length;

@@ -144,7 +184,17 @@ function retrieveDOIs(dois) {
throw new Error("DOI Translator: could not find DOI");
}

// Only show items that resolved successfully
// If showSelect is false, don't show a Select Items dialog,
// just complete if we can
if (!showSelect) {
let firstItem = items[Object.keys(items)[0]];
if (firstItem) {
firstItem.complete();
}
return;
}

// Otherwise, allow the user to select among items that resolved successfully
let select = {};
for (let doi in items) {
let item = items[doi];
@@ -169,10 +219,10 @@ function retrieveDOIs(dois) {
}
}

function doWeb(doc) {
var dois = getDOIs(doc);
Z.debug(dois);
retrieveDOIs(dois);
function doWeb(doc, url) {
let doiOrDOIs = getDOIs(doc, url);
Z.debug(doiOrDOIs);
retrieveDOIs(doiOrDOIs);
}

/** BEGIN TEST CASES **/
@@ -204,8 +254,64 @@ var testCases = [
},
{
"type": "web",
"url": "https://www.developmentbookshelf.com/action/showPublications",
"items": "multiple"
"url": "https://onlinelibrary.wiley.com/doi/full/10.7448/IAS.15.5.18440",
"items": [
{
"itemType": "journalArticle",
"title": "Track C Epidemiology and Prevention Science",
"creators": [],
"date": "2012-10-22",
"DOI": "10.7448/IAS.15.5.18440",
"ISSN": "1758-2652",
"issue": "Suppl 3",
"journalAbbreviation": "Journal of the International AIDS Society",
"libraryCatalog": "DOI.org (Crossref)",
"publicationTitle": "Journal of the International AIDS Society",
"url": "http://doi.wiley.com/10.7448/IAS.15.5.18440",
"volume": "15",
"attachments": [],
"tags": [],
"notes": [],
"seeAlso": []
}
]
},
{
"type": "web",
"url": "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/BEJTMI",
"items": [
{
"itemType": "document",
"title": "Transfer of 50 thousand improved genetically improved farmed tilapia (GIFT) fry to Nigeria",
"creators": [
{
"lastName": "Trinh",
"firstName": "Trong",
"creatorType": "author"
},
{
"lastName": "Trinh",
"firstName": "Trong",
"creatorType": "contributor"
},
{
"lastName": "WorldFish",
"creatorType": "contributor",
"fieldMode": true
}
],
"date": "2023",
"abstractNote": "The data contains the list of female broodstock that produced improved GIFT fry sent to Nigeria in three batches in 2022",
"extra": "Type: dataset\nDOI: 10.7910/DVN/BEJTMI",
"libraryCatalog": "DOI.org (Datacite)",
"publisher": "Harvard Dataverse",
"url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/BEJTMI",
"attachments": [],
"tags": [],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/

0 comments on commit 48a038d

Please sign in to comment.