scripts/origin_content_classifier.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Classify origins by the web-content traits found in their responses:
speculative-loading link relations, compression dictionaries, and the
Google Publisher Tag."""

# prompt: for each json file in directory D, run function "dothis"

import json
import os

import requests
#import sys

# Web Content string fragments
# <link rel="preconnect" or <link rel=preconnect
# <link rel="dns-prefetch"
# https://developer.mozilla.org/en-US/docs/Web/Performance/Speculative_loading
# https://developer.mozilla.org/en-US/docs/Web/Performance/dns-prefetch
# https://fetch.spec.whatwg.org/#concept-request-destination
mb_dnsprefetch = [ 'rel="dns-prefetch"', 'rel=dns-prefetch' ]
mb_preconnect = [ 'rel=preconnect', 'rel="preconnect"' ]
mb_preload = [ 'rel="preload"', 'rel=preload' ]
mb_prefetch = [ 'rel="prefetch"', 'rel=prefetch' ]
mb_prerender = [ 'rel="prerender"', 'rel=prerender' ]

# google publisher tag
# https://developers.google.com/publisher-tag
mb_gpt = [ 'gpt.js' ]

# https://datatracker.ietf.org/doc/draft-ietf-httpbis-compression-dictionary/
mb_compressiondict = [ 'rel="compression-dictionary"', 'rel=compression-dictionary' ]
mh_compressiondict = [ 'Use-As-Dictionary', 'Available-Dictionary', 'Dictionary-ID' ]


# Check a response's body text or header names for any of the match strings.
# Returns 1 on a match, 0 otherwise.
def classify_origin(r, tag, matchdict, field):
    if field == "headers":
        sfield = dict(r.headers)
    elif field == "text":
        sfield = r.text
    else:
        return 0
    matchp = 0
    for item in matchdict:
        if item in sfield:
            matchp = 1
    return matchp


def classify_sitelist(file, tag, matchdict, field):
    """Load one serialized response file and, if any match string is found in
    the requested field, append the origin's URL to the per-tag output file."""
    try:
        with open(file, 'r') as f:
            #print(f"file: {file}")
            data = json.load(f)
            data_url = data['url']
            #data_text = data['text']
            data_field = data[field]
            #print(data.keys())
            # Expected shape of the serialized response:
            # data = {
            #     'url': r.request.url,
            #     'text': r.text,
            #     'headers': dict(r.headers),
            #     'status_code': r.status_code,
            #     'datetime': datetime.datetime.now().isoformat(),
            # }
            matchp = False
            for item in matchdict:
                if item in data_field:
                    matchp = True
                    #print(f"item: {item}")
                    #print(f"txt: {data_text}")
            if matchp:
                with open("response_" + field + "_matches_" + tag + ".txt", "a") as ofile:
                    ofile.write(data_url + '\n')
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in {file}: {e}")
    except Exception as e:
        print(f"Error processing {file}: {e}")


# Assumes input directory is the base directory of a sitelist scan that serialized responses,
# aka, results from a run of origin_reachable_and_response.py.
def classify_json_files(directory, tag, matchdict, field):
    origincount = 0
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            classify_sitelist(filepath, tag, matchdict, field)
            origincount += 1
    print("sites total: " + str(origincount))


def classify_web_content_sitelists(idir):
    classify_json_files(idir, "dns-prefetch", mb_dnsprefetch, "text")
    classify_json_files(idir, "preconnect", mb_preconnect, "text")
    classify_json_files(idir, "preload", mb_preload, "text")
    classify_json_files(idir, "prefetch", mb_prefetch, "text")
    classify_json_files(idir, "prerender", mb_prerender, "text")
    classify_json_files(idir, "compression-dictionary", mb_compressiondict, "text")
    classify_json_files(idir, "compression-dictionary", mh_compressiondict, "headers")
    classify_json_files(idir, "google-publisher-tag", mb_gpt, "text")


# Take an input url and return a dictionary of classifier types.
def classify_web_content_traits(url):
    tdict = {}
    r = requests.get(url, timeout=10)
    compressiondictp = 0
    ztp = classify_origin(r, "compression-dictionary", mb_compressiondict, "text")
    zhp = classify_origin(r, "compression-dictionary-header", mh_compressiondict, "headers")
    if ztp or zhp:
        compressiondictp = 1
    tdict["compression-dictionary"] = compressiondictp
    tdict["dns-prefetch"] = classify_origin(r, "dns-prefetch", mb_dnsprefetch, "text")
    tdict["google-publisher-tag"] = classify_origin(r, "google-publisher-tag", mb_gpt, "text")
    tdict["preconnect"] = classify_origin(r, "preconnect", mb_preconnect, "text")
    tdict["prefetch"] = classify_origin(r, "prefetch", mb_prefetch, "text")
    tdict["preload"] = classify_origin(r, "preload", mb_preload, "text")
    tdict["prerender"] = classify_origin(r, "prerender", mb_prerender, "text")
    return tdict
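# --- Hypothetical usage sketch (not part of the original script) ---
# A minimal driver showing how the two entry points are meant to be called:
# classify_web_content_traits() fetches one origin live, while
# classify_web_content_sitelists() walks a directory of serialized responses.
# The URL and directory name below are placeholder assumptions.
if __name__ == "__main__":
    # Live classification of a single origin (performs an HTTP GET).
    print(classify_web_content_traits("https://example.com/"))
    # Offline classification of a previously serialized sitelist scan.
    classify_web_content_sitelists("scan_results")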