in src/lib/server/websearch/scrape/parser.ts [430:489]
percentageTextShare: percentageTextShare(cluster, totalText),
};
});
// if there is a dominant cluster with more than 60% text share, return that
const dominantCluster = clusterWithMetrics[0]?.percentageTextShare > 60;
if (dominantCluster) return [clusterWithMetrics[0].cluster];
// clusters are sorted by text share after applying a penalty for centrality
const sortedClusters = clusterWithMetrics.sort((a, b) => {
const penaltyForA = Math.pow(0.9, a.centrality / 100);
const penaltyForB = Math.pow(0.9, b.centrality / 100);
const adjustedTextShareA = a.percentageTextShare * penaltyForA;
const adjustedTextShareB = b.percentageTextShare * penaltyForB;
return adjustedTextShareB - adjustedTextShareA;
});
// find all clusters that are similar to the largest cluster in terms of text share
// and see if they are enough to cover at least 60% of the text share
const largeTextShareClusters = sortedClusters.filter((c) =>
approximatelyEqual(c.percentageTextShare, sortedClusters[0]?.percentageTextShare, 10)
);
const totalTextShareOfLargeClusters = largeTextShareClusters.reduce(
(acc, cluster) => acc + cluster.percentageTextShare,
0
);
if (totalTextShareOfLargeClusters > 60) {
return largeTextShareClusters.map((c) => c.cluster);
}
// choose clusters till the text share is greater than 60%
let totalTextShare = 0;
const criticalClusters = [];
for (const cluster of sortedClusters) {
/** Ignore clusters with less than 2% text share */
if (cluster.percentageTextShare < 2) continue;
if (totalTextShare > 60) break;
criticalClusters.push(cluster.cluster);
totalTextShare += cluster.percentageTextShare;
}
// if the total text share is less than 60%, return an empty array,
// as this website is unlikely to be particularly useful for the web search anyway;
// this should almost never happen on a structured website with a lot of text
if (totalTextShare < 60) {
return [];
}
return criticalClusters;
};
/**
 * Names of the HTML attributes that `serializeHTMLElement` iterates over
 * when building the `attributes` field of a serialized element.
 */
const allowListedAttributes = [
  // URL-bearing attributes
  "href",
  "src",
  // descriptive text attributes
  "alt",
  "title",
  // styling / identification hooks
  "class",
  "id",
];
function serializeHTMLElement(node: Element): SerializedHTMLElement {
return {
tagName: node.tagName.toLowerCase(),
attributes: allowListedAttributes.reduce(
(acc, attr) => {