export function getPageWordsBySearch()

in source/app/utils/document.js [439:567]


/**
 * Find the WORD blocks on a page that match a free-text search query.
 *
 * The query is split on spaces and re-joined with single spaces, so leading,
 * trailing and repeated spaces are ignored. Matching is case-insensitive and
 * literal (regex metacharacters in the query are escaped). A query may span
 * several adjacent words of a line; in that case the words are merged into a
 * single entry whose box covers them.
 *
 * @param {Object} document - Document passed through to `getPageChildrenByType`
 *   and `getDocumentBlocksByIds`.
 * @param {number} pageNumber - Page passed through to `getPageChildrenByType`.
 * @param {string} searchQuery - Text to look for.
 * @returns {Array<{Text: string, Top: number, Left: number, Width: number, Height: number}>}
 *   One entry per matching word (or merged run of words), grouped by line.
 *   Empty when the query is empty or consists only of spaces.
 */
export function getPageWordsBySearch(document, pageNumber, searchQuery) {
  if (!searchQuery) return []

  const searchQueryWords = reject(isEmpty, searchQuery.split(' '))

  // A spaces-only query would compile to the empty pattern. With the `g` flag
  // that pattern matches at index 0 without ever advancing `lastIndex`, which
  // would make the exec loop below spin forever.
  if (searchQueryWords.length === 0) return []

  function escapeRegex(str) {
    return str.replace(/[-[\]{}()*+?.,\\^$|]/g, '\\$&')
  }

  // Both regexes are built from the SAME normalised query. Line filtering,
  // word location and the final combine step must agree on what a match is,
  // otherwise a stray space in the query silently drops every result.
  const normalizedPattern = escapeRegex(searchQueryWords.join(' '))
  const searchQueryRegex = new RegExp(normalizedPattern, 'i')
  const matchRegexp = new RegExp(normalizedPattern, 'ig')

  // Map a character offset in the space-joined word text back to the index of
  // the word containing it. The separator after a word counts as part of that
  // word, so an exclusive end offset still resolves to the last matched word.
  function getWordIndexByStringIndex(wordBlocks, idx) {
    let searchIndex = 0
    for (let wordIndex = 0; wordIndex < wordBlocks.length; wordIndex++) {
      const span = wordBlocks[wordIndex].Text.length + 1
      if (idx < searchIndex + span) return wordIndex
      searchIndex += span
    }
    return wordBlocks.length
  }

  // Get all the LINE Blocks for a PAGE that match the searchQuery
  const lines = getPageChildrenByType(document, pageNumber, 'LINE')
  const matchingLines = lines.filter(({ Text }) => searchQueryRegex.test(Text))

  // Get all the WORD Blocks for each LINE that match the searchQuery
  return matchingLines.reduce((accumulator, { Relationships }) => {
    const wordIds = path([0, 'Ids'], Relationships) || []

    // Sort all the WORD Blocks in order from left to right
    const wordBlocks = sortWith([ascend(path(['Geometry', 'BoundingBox', 'Left']))])(
      getDocumentBlocksByIds(document, wordIds)
    )
    const wordText = wordBlocks.map(word => word.Text).join(' ')

    // Collect every word touched by any occurrence of the query, once each.
    const matchingWordBlocks = []
    const seenBlocks = new Set()
    matchRegexp.lastIndex = 0 // shared global regex: always start from 0
    let res
    while ((res = matchRegexp.exec(wordText)) !== null) {
      const startWord = getWordIndexByStringIndex(wordBlocks, res.index)
      const endWord = getWordIndexByStringIndex(wordBlocks, res.index + res[0].length)

      for (let i = startWord; i <= endWord && i < wordBlocks.length; i++) {
        const block = wordBlocks[i]
        if (!seenBlocks.has(block)) {
          seenBlocks.add(block)
          matchingWordBlocks.push(block)
        }
      }
    }

    // TODO most of the below logic can probably be removed / consolidated into the above

    // Pick specific props from each WORD Block
    const matchingWordBounds = matchingWordBlocks.map(
      ({
        Text,
        Geometry: {
          BoundingBox: { Top, Left, Width, Height },
        },
      }) => ({ Text, Top, Left, Width, Height })
    )

    // Sort all the words by their location from top/left to bottom/right.
    // Tops closer than 0.01 are treated as the same row so that slightly
    // uneven baselines do not reorder words within a line.
    const matchingWordBoundsSorted = sortWith([
      ({ Top: a }, { Top: b }) => {
        const difference = Math.floor(Math.abs(a - b) * 100)
        if (!difference) return 0
        return a < b ? -1 : 1
      },
      ascend(path(['Left'])),
    ])(matchingWordBounds)

    // Combine words together such that they match the query (and merge their bounding box info)
    const matchingWordBoundsCombined = []
    let unmatched = null
    for (const word of matchingWordBoundsSorted) {
      // If a single word matches the query, add it to the list
      if (searchQueryRegex.test(word.Text)) {
        unmatched = null
        matchingWordBoundsCombined.push(word)
        continue
      }

      // Nothing pending yet: remember this word and try to extend it next time
      if (!unmatched) {
        unmatched = word
        continue
      }

      // See if combining the pending word(s) with this one matches the query.
      // NOTE(review): the merged Top takes the larger of the two tops; a true
      // union box would presumably use the smaller one - confirm the intent.
      const combinedWords = {
        Text: `${unmatched.Text} ${word.Text}`,
        Top: Math.max(unmatched.Top, word.Top),
        Left: Math.min(unmatched.Left, word.Left),
        Width: word.Left - unmatched.Left + word.Width,
        Height: Math.max(unmatched.Height, word.Height),
      }

      if (searchQueryRegex.test(combinedWords.Text)) {
        // The combined words match the query, add them to the list
        unmatched = null
        matchingWordBoundsCombined.push(combinedWords)
      } else {
        // Otherwise keep growing the pending run
        unmatched = combinedWords
      }
    }

    return accumulator.concat(matchingWordBoundsCombined)
  }, [])
}