in source/app/utils/document.js [439:567]
/**
 * Find the words on a page that match a search query and return their bounding
 * boxes, merging neighbouring words when the query spans more than one word.
 *
 * The query is split on spaces and empty pieces are dropped, so leading, trailing
 * and repeated spaces are ignored. Matching is case-insensitive and literal
 * (regex metacharacters in the query are escaped).
 *
 * @param {Object} document - Document forwarded to getPageChildrenByType and
 *   getDocumentBlocksByIds.
 * @param {*} pageNumber - Page selector forwarded to getPageChildrenByType.
 * @param {string} searchQuery - Text to search for.
 * @returns {Array<{Text: string, Top: number, Left: number, Width: number, Height: number}>}
 *   One entry per match; empty when the query is empty, contains only spaces,
 *   or nothing matches.
 */
export function getPageWordsBySearch(document, pageNumber, searchQuery) {
  if (!searchQuery) return []

  const searchQueryWords = reject(isEmpty, searchQuery.split(' '))
  // A query made only of spaces has no words. Without this guard the pattern
  // below would be empty, and a global regex that matches the empty string
  // never advances lastIndex, so the exec() loop would spin forever.
  if (searchQueryWords.length === 0) return []

  function escapeRegex(str) {
    return str.replace(/[-[\]{}()*+?.,\\^$|]/g, '\\$&')
  }

  // Use the same normalized pattern for every phase (line filter, word lookup,
  // merge) so that stray spaces in the query cannot make the phases disagree.
  const searchPattern = escapeRegex(searchQueryWords.join(' '))
  const searchQueryRegex = RegExp(searchPattern, 'i')

  // Get all the LINE Blocks for a PAGE that match the searchQuery
  const lines = getPageChildrenByType(document, pageNumber, 'LINE')
  const matchingLines = lines.filter(({ Text }) => searchQueryRegex.test(Text))

  // Get all the WORD Blocks for each LINE that match the searchQuery
  const matchingWords = matchingLines.reduce((accumulator, { Relationships }) => {
    const wordIds = path([0, 'Ids'], Relationships) || []

    // Sort all the WORD Blocks in order from left to right
    const wordBlocks = sortWith([ascend(path(['Geometry', 'BoundingBox', 'Left']))])(
      getDocumentBlocksByIds(document, wordIds)
    )
    const wordText = wordBlocks.map(word => word.Text).join(' ')

    // Fresh global regex per line: exec() keeps its position in lastIndex.
    const matchRegexp = RegExp(searchPattern, 'ig')

    // Map a character offset in wordText to the index of the word containing it.
    // The single space after a word is counted as belonging to that word.
    function getWordIndexByStringIndex(idx) {
      let searchIndex = 0
      let wordIndex
      for (wordIndex = 0; wordIndex < wordBlocks.length; wordIndex++) {
        const word = wordBlocks[wordIndex]
        if (idx < searchIndex + word.Text.length + 1) return wordIndex
        searchIndex += word.Text.length + 1
      }
      return wordIndex
    }

    // Collect every WORD Block touched by a match, without duplicates
    const matchingWordBlocks = []
    let res
    while ((res = matchRegexp.exec(wordText)) !== null) {
      const startIndex = res.index
      const endIndex = startIndex + res[0].length
      const startWord = getWordIndexByStringIndex(startIndex)
      const endWord = getWordIndexByStringIndex(endIndex)
      for (let i = startWord; i <= endWord; i++) {
        if (!matchingWordBlocks.includes(wordBlocks[i])) matchingWordBlocks.push(wordBlocks[i])
      }
    }

    // TODO most of the below logic can probably be removed / consolidated into the above
    // Pick specific props from each WORD Block
    const matchingWordBounds = matchingWordBlocks.map(
      ({
        Text,
        Geometry: {
          BoundingBox: { Top, Left, Width, Height },
        },
      }) => ({
        Text,
        Top,
        Left,
        Width,
        Height,
      })
    )

    // Sort all the words by their location from top/left to bottom/right.
    // Tops that differ by less than 0.01 are treated as being on the same row.
    const matchingWordBoundsSorted = sortWith([
      ({ Top: a }, { Top: b }) => {
        const difference = pipe(
          Math.abs,
          multiply(100),
          Math.floor
        )(a - b)
        return !difference ? 0 : a < b ? -1 : a > b ? 1 : 0
      },
      ascend(path(['Left'])),
    ])(matchingWordBounds)

    // Combine words together such that they match the query (and merge their bounding box info)
    let unmatched = null
    const matchingWordBoundsCombined = matchingWordBoundsSorted.reduce((combined, word) => {
      // If a single word matches the query, add it to the list
      if (searchQueryRegex.test(word.Text)) {
        unmatched = null
        return [...combined, word]
      }

      // If there wasn't an unmatched word from a previous iteration, set unmatched
      if (!unmatched) {
        unmatched = word
        return combined
      }

      // Otherwise see if joining the pending text with this word matches the query
      const combinedText = `${unmatched.Text} ${word.Text}`
      const combinedWords = {
        Text: combinedText,
        // NOTE(review): Top takes the larger of the two values; a union box would
        // take the smaller one. Kept as-is, confirm this is intended.
        Top: Math.max(unmatched.Top, word.Top),
        Left: Math.min(unmatched.Left, word.Left),
        // Relies on the sort above placing `word` to the right of `unmatched`
        Width: word.Left - unmatched.Left + word.Width,
        Height: Math.max(unmatched.Height, word.Height),
      }

      // If the combined words match the query, add it to the list
      if (searchQueryRegex.test(combinedText)) {
        unmatched = null
        return [...combined, combinedWords]
      }

      // Otherwise, keep accumulating into unmatched
      unmatched = combinedWords
      return combined
    }, [])

    return accumulator.concat(matchingWordBoundsCombined)
  }, [])

  return matchingWords
}