percentageTextShare: percentageTextShare()

in src/lib/server/websearch/scrape/parser.ts [430:489]


				percentageTextShare: percentageTextShare(cluster, totalText),
			};
		});

		// if there is a dominant cluster with more than 60% text share, return that
		const dominantCluster = clusterWithMetrics[0]?.percentageTextShare > 60;
		if (dominantCluster) return [clusterWithMetrics[0].cluster];

		// clusters are sorted by text share after applying a penalty for centrality
		const sortedClusters = clusterWithMetrics.sort((a, b) => {
			const penaltyForA = Math.pow(0.9, a.centrality / 100);
			const penaltyForB = Math.pow(0.9, b.centrality / 100);
			const adjustedTextShareA = a.percentageTextShare * penaltyForA;
			const adjustedTextShareB = b.percentageTextShare * penaltyForB;

			return adjustedTextShareB - adjustedTextShareA;
		});

		// find all clusters that are similar to the largest cluster in terms of text share
		// and see if they are enough to cover at least 60% of the text share
		const largeTextShareClusters = sortedClusters.filter((c) =>
			approximatelyEqual(c.percentageTextShare, sortedClusters[0]?.percentageTextShare, 10)
		);

		const totalTextShareOfLargeClusters = largeTextShareClusters.reduce(
			(acc, cluster) => acc + cluster.percentageTextShare,
			0
		);

		if (totalTextShareOfLargeClusters > 60) {
			return largeTextShareClusters.map((c) => c.cluster);
		}

		// choose clusters till the text share is greater than 60%
		let totalTextShare = 0;
		const criticalClusters = [];
		for (const cluster of sortedClusters) {
			/** Ignore clusters with less than 2% text share */
			if (cluster.percentageTextShare < 2) continue;
			if (totalTextShare > 60) break;
			criticalClusters.push(cluster.cluster);
			totalTextShare += cluster.percentageTextShare;
		}

		// if the total text share is less than 60%, return an empty array,
		// as this website is unlikely to be useful for the web search anyway.
		// this should almost never happen on a structured website with a lot of text
		if (totalTextShare < 60) {
			return [];
		}

		return criticalClusters;
	};

	// Attribute names that serializeHTMLElement reduces over to build the
	// `attributes` field of a serialized element.
	// NOTE(review): presumably attributes not listed here are omitted from the
	// serialized output — confirm against the reducer body.
	const allowListedAttributes = ["href", "src", "alt", "title", "class", "id"];
	function serializeHTMLElement(node: Element): SerializedHTMLElement {
		return {
			tagName: node.tagName.toLowerCase(),
			attributes: allowListedAttributes.reduce(
				(acc, attr) => {