find_examples_manpage_data.py (90 lines of code) (raw):
import argparse
import html.parser
import os
import re
import pandas as pd
from tqdm.auto import tqdm
class HTMLTextParser(html.parser.HTMLParser):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.data = []
def handle_data(self, data):
self.data.append(data)
def check_query(query):
return query[0].isupper()
def get_examples(command):
name = command['name']
examples = []
regex_name = re.compile(fr"\b{re.escape(name)}\b")
for p in command['paragraphs']:
text = p['text'].strip()
parser = HTMLTextParser()
parser.feed(text)
p['text'] = "".join(parser.data)
query = ""
for i, p in enumerate(command['paragraphs']):
if p['section'] is None:
continue
section = p['section'].lower()
if not "example" in section:
continue
example = p['text']
found = False
if example.startswith(name):
found = True
if not found and example.startswith("$ "):
example = example[2:]
found = True
if not found:
for match in regex_name.finditer(example):
if match.start(0) > 10:
query = example[:match.start(0)].strip()
example = example[match.start(0):]
found = True
break
if not found:
continue
example = re.sub(r"\\\s+", "", example)
if example.find('\n') != -1:
last_index = example.find('\n')
if check_query(example[last_index:].strip()):
query = example[last_index:].strip()
example = example[:last_index].strip()
if i > 0:
line = command['paragraphs'][i - 1]['text']
if line.endswith(":"):
query = line[:-1]
if not query and i + 1 < len(command['paragraphs']):
line = command['paragraphs'][i + 1]['text']
if check_query(line):
query = line
examples.append([name, example, query])
query = ""
return examples
def main(args):
if not os.path.isfile(args.input):
raise ValueError(f"Can't find file '{args.input}'")
chunk_size = args.chunk_size
data = pd.read_json(args.input, lines=True, chunksize=chunk_size)
TOTAL_LINES = 36668
total_iterations = (TOTAL_LINES + chunk_size - 1) // chunk_size
examples = []
examples_with_query = 0
with tqdm(data, total=total_iterations) as progress_bar:
for chunk in progress_bar:
for i, command in chunk.iterrows():
new_examples = get_examples(command)
for ex in new_examples:
if ex[2]:
examples_with_query += 1
examples.extend(new_examples)
progress_bar.set_postfix({"examples": len(examples), "examples with query": examples_with_query})
examples = pd.DataFrame(examples, columns=["name", "command", "context"])
examples.to_csv(args.output, index=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Crawl examples from manpage-data.json")
parser.add_argument("input", type=str, help="path to manpage-data.json")
parser.add_argument("--chunk-size", type=int, default=100, help="number of lines in memory")
parser.add_argument("-o", "--output", type=str, default="manpage-examples.csv", help="path to output file")
args = parser.parse_args()
main(args)