in app/pypi-recent/lib/parser.js [73:187]
/**
 * Try to interpret a line as a changelog release heading and pull the
 * version number (and, when present, the release date) out of it.
 *
 * @param {string[]} context - `context[0]` is the candidate heading line.
 *   `context[1]`, when present, is the line below it and is inspected for an
 *   underline made of `=` or `-` characters.
 * @returns {{version: *, date: *}|null} The first acceptable entry returned
 *   by `findVersions` plus whatever `this.dateSearch` returned for the rest of
 *   the line. Returns `null` when no acceptable version is found, or when the
 *   line has neither a positive date nor heading markup.
 */
Parser.prototype.extractHeaderInfo = function(context) {
  // Major/patch components at or above this value are treated as the year
  // part of a date (YYYY.MM.DD / DD.MM.YYYY) rather than a version component.
  const DATE_COMPONENT_THRESHOLD = 1900;
  // An underline run must cover more than this fraction of the line below.
  const UNDERLINE_RATIO = 0.8;
  const markdownLink = /\[([^\]]+)\]\(([^)]+)\)/;

  // Replace every markdown link with just its text. Link hrefs often contain
  // version numbers from tags or branch names, which confuse the parser.
  let target = context[0];
  let link = target.match(markdownLink);
  while (link) {
    target = target.slice(0, link.index) + link[1] + target.slice(link.index + link[0].length);
    link = target.match(markdownLink);
  }

  // Determine the markup weight of the target line (0 = not a heading).
  let markupWeight = 0;
  if (target.startsWith('###')) {
    markupWeight = 3;
  } else if (target.startsWith('##')) {
    markupWeight = 2;
  } else if (target.startsWith('#')) {
    markupWeight = 1;
  } else if (context[1]) {
    // Underline-style heading: the first run of `=` or `-` on the line below
    // must make up most of that line. `|` is deliberately not part of the
    // character class, so table separators like `|---|---|` do not qualify.
    const runs = context[1].match(/[=-]+/g) || [];
    if (runs.length && runs[0].length > context[1].length * UNDERLINE_RATIO) {
      markupWeight = 1;
    }
  }

  // Find a version number in the line.
  const versions = findVersions(target, { loose: true });
  if (!Array.isArray(versions)) {
    return null;
  }

  let version;
  for (const candidate of versions) {
    const parsed = parseSemVer(candidate);
    // Dates such as YYYY.MM.DD or DD.MM.YYYY look like versions; an
    // excessively large major or patch component gives them away.
    // NOTE(review): the `parsed &&` guard assumes parseSemVer may return a
    // falsy value for unparseable input — confirm against its definition.
    if (parsed &&
        parsed.major < DATE_COMPONENT_THRESHOLD &&
        parsed.patch < DATE_COMPONENT_THRESHOLD) {
      version = candidate;
      break;
    }
  }

  if (!version) {
    // Don't bother doing a CPU laden brute-force date search if there is no
    // version number.
    return null;
  }

  // Strip identified versions from the date search string so they are not
  // falsely interpreted as dates.
  let dateTarget = target;
  // `match` returns null when nothing matches; fall back to an empty list so
  // the loop below cannot throw.
  const looseVersions = dateTarget.match(versionNumber) || [];

  // Full semantic versions first.
  for (const found of versions) {
    dateTarget = dateTarget.replace(found, '');
  }
  // Then versions like 1.2, which the "loose" parser picks up and coerces to
  // 1.2.0, so the coerced form is not literally present in the line.
  for (const found of looseVersions) {
    // A non-global pattern yields capture groups, which may be undefined.
    if (typeof found === 'string') {
      dateTarget = dateTarget.replace(found, '');
    }
  }

  // Normalise the remaining text into something the date search handles well.
  dateTarget = dateTarget.replace(/ +(?= )/g, '');       // collapse runs of spaces
  dateTarget = dateTarget.replace(/(\(|\)|\:)/g, '');    // drop parens and colons
  dateTarget = dateTarget.replace(/(\-)/g, '/');         // unify date separators
  dateTarget = dateTarget.replace(/(\.)/g, '/');
  dateTarget = dateTarget.replace('now', '');  // The date parser is a little too good ;)
  dateTarget = dateTarget.replace('week', ''); // So we must cripple it a bit

  // Do a brute force search for a date.
  const date = this.dateSearch(dateTarget);

  // A version only counts as a release heading when it is accompanied by a
  // date or by heading markup.
  if (date > 0 || markupWeight > 0) {
    return {
      version: version,
      date: date
    };
  }
  return null;
};