search/_scripts/index.js (162 lines of code) (raw):

#!/usr/bin/env node 'use strict'; // Script includes const fs = require('fs'); const el = require('elasticlunr'); const recursive = require('recursive-readdir'); const crypto = require('crypto'); const optionParser = require('command-line-args'); const path = require('path'); const cheerio = require('cheerio'); const VERSION = '0.1'; const DESCRIPTION = 'Script to build the "elasticlunr" search index.' const SCRIPT_NAME = path.basename(process.argv[1]); /** * Calculates the md5 hash of a string and returns the * result as an integer. */ function hashString(str) { var md5 = crypto.createHash('md5'); var hex_digest = md5.update(str).digest('hex'); // Convert to integer return parseInt(hex_digest.slice(0, 8), 16); } function mkdirp(directory) { fs.existsSync(directory) || fs.mkdirSync(directory); } /** * Truncates a string to a maximum number of words. */ function truncateWords(str, words) { return str.split(/\b/).slice(0, words * 2).join(''); } const optionParams = [ {name: 'index-dir', alias: 'd', type: String, defaultOption: true}, {name: 'output', alias: 'o', type: String, defaultValue: './index.json'}, {name: 'version', alias: 'v', type: Boolean}, {name: 'verbose', alias: 'V', type: Boolean}, {name: 'help', alias: 'h', type: Boolean} ]; const options = optionParser(optionParams); if (options.help) { console.log(SCRIPT_NAME + ' - ' + DESCRIPTION + '\n'); console.log( 'Usage: ' + SCRIPT_NAME + ' INDEX-DIR [ --output OUT-FILE --verbose]' ); process.exit(); } if (options.version) { console.log(process.argv[0] + ' - v' + VERSION); process.exit(); } function padLeft(str, char, length) { if(typeof str !== "undefined") return str + char.repeat(Math.max(0, length - str.length)); else return ''; } // find all unique file extensions in bash using: // for a in `find content -type f -print `; do filename=`basename $a`; fileext=${filename##*.}; echo "$fileext"; done | sort -u // TODO: should build this up dynamically instead var ignore = [ '*.PNG', '*.cache', '*.css', '*.docx', '*.eot', '*.gif', '*.jar', '*.jpg', '*.js', '*.less', '*.odp', '*.otf', '*.pdf', '*.pdn', '*.png', '*.ppt', '*.pptx', '*.rb', '*.rdf', '*.scss', '*.svg', '*.ttf', '*.woff', '*.woff2', '*.xml', '*.xsd' ]; // Create the index schema. var index = el(function () { this.addField('title'), this.addField('body'), this.addField('description'), this.addField('url'), this.setRef('id') }); var indexDir = options['index-dir']; recursive(indexDir, ignore, function (err, files) { if (err) { console.log(err); return; } for (var file of files) { var html = fs.readFileSync(file, 'utf-8'); file = file.replace(/\\/g, '/'); file = file.replace(new RegExp('^' + options['index-dir']), ''); if (options.verbose) { console.log("") console.log("Parsing: " + file); } // Parse the DOM. var $ = cheerio.load(html, { normalizeWhitespace: true, xmlMode: false }); var withIds = $( '[id]' ); $(withIds).each( function(i, withId) { //var sectionHeading = $(withId).prevAll('[id]') var sectionId = $(withId).attr('id'); var childIds = $(withId).children('[id]'); var isHeading = (withId.name === "h1" || withId.name === "h2" || withId.name === "h3" || withId.name === "h4" || withId.name === "h5" || withId.name === "h6") // has no child sections if(isHeading && childIds.length == 0) { var headingWithId = withId; var sectionTitle = $(headingWithId).text(); sectionTitle = sectionTitle.replace(new RegExp('^[0-9\.]*\. '), ''); var sectionDiv = $(headingWithId).parent('div')[0] var sectionParagraphs = $(sectionDiv).children('div.paragraph') var firstPara = $(sectionParagraphs).length > 0 ? $(sectionParagraphs[0]).text() : "" var sectionDescription = truncateWords(firstPara, 50); var sectionBodyText =sectionTitle + " " + sectionParagraphs.text() var sectionUrl = file + "#" + sectionId var docId = hashString(sectionUrl) var doc = { title: sectionTitle, url: sectionUrl, body: sectionBodyText, description: sectionDescription, id: docId }; if (options.verbose) { console.log(" " + padLeft(sectionId, ' ', 40) + ": " + sectionTitle); /* console.log(" sectionTitle : " + sectionTitle); console.log(" sectionUrl : " +sectionUrl); console.log(" id : " + docId); console.log(" sectionDescription: " + sectionDescription); console.log(" sectionBodyText : " + sectionBodyText); console.log("") */ } index.addDoc(doc); } }); } // Serialise and write the index. var out = index.toJSON(); /* // Remove the body field from the documentStore to decrease the size of the index. for (var id in out.documentStore.docs) { delete out.documentStore.docs[id].body; } */ if (options.verbose) { console.log("Serialising to: " + options.output) } var outputFile = options.output; var outputFileParse = path.parse(outputFile); mkdirp(outputFileParse.dir) fs.writeFileSync(outputFile, JSON.stringify(out), 'utf-8'); if (options.verbose) { console.log('done'); } process.exit(); });