clean_and_create/load_data.py [72:106]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Patterns whose presence disqualifies a question/answer text. They are
# compiled once at import time instead of being rebuilt on every call.
# NOTE: re.IGNORECASE is applied to *every* pattern (not only 'unanswerable'),
# so e.g. 'q1: ' and 'a2: ' are rejected as well.
_REJECT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\{.*?\}',  # { ... } on a single line
        r'\[.*?\]',  # [ ... ] on a single line
        r'<.*?>',    # < ... > on a single line (e.g. markup tags)
        r'\b\d{1,3}(\.\d{1,3}){3}\b',  # IP-address-like dotted quads
        r'\w+\.\w+',  # word.word (file names, URLs; also decimals like 3.14)
        r'\n\s*\n',  # a blank line (two newlines separated only by whitespace)
        r'unanswerable',  # the literal word 'unanswerable'
        r'Q\d+: ',  # an embedded question marker such as 'Q2: '
        r'A\d+: ',  # an embedded answer marker such as 'A2: '
    )
)


def is_valid_question_or_answer(text):
    """Return True if *text* is usable as a question or an answer.

    A text is rejected (False) when it is:
      * not a string (e.g. None, or the float NaN that pandas uses for
        missing cells) -- previously a truthy non-string raised
        AttributeError on ``.strip()``;
      * empty or whitespace-only;
      * matched anywhere by one of ``_REJECT_PATTERNS`` (bracketed spans,
        dotted tokens, blank lines, the word 'unanswerable', or embedded
        'Q<n>: ' / 'A<n>: ' markers), compared case-insensitively.

    Args:
        text: The candidate question or answer.

    Returns:
        bool: True when none of the rejection rules apply, otherwise False.
    """
    if not isinstance(text, str) or not text.strip():
        return False

    return not any(pattern.search(text) for pattern in _REJECT_PATTERNS)

# Function to process a single group
def process_group(key_group):
    try:
        key, group = key_group
        qa_pairs = []
        for _, row in group.iterrows():
            question = re.sub(r'^Q\d+: ', '', row['question'])
            answer = re.sub(r'^A\d+: ', '', row['answer'])
            if is_valid_question_or_answer(question) and is_valid_question_or_answer(answer):
                qa_pairs.append({
                    "user": question,
                    "assistant": answer,
                    "source": "PDFA key: " + str(row['__key__'])
                })
        if qa_pairs:
            return {
                "texts": qa_pairs,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



create_only_with_pdfs/load_data.py [16:50]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Patterns whose presence disqualifies a question/answer text. They are
# compiled once at import time instead of being rebuilt on every call.
# NOTE: re.IGNORECASE is applied to *every* pattern (not only 'unanswerable'),
# so e.g. 'q1: ' and 'a2: ' are rejected as well.
_REJECT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\{.*?\}',  # { ... } on a single line
        r'\[.*?\]',  # [ ... ] on a single line
        r'<.*?>',    # < ... > on a single line (e.g. markup tags)
        r'\b\d{1,3}(\.\d{1,3}){3}\b',  # IP-address-like dotted quads
        r'\w+\.\w+',  # word.word (file names, URLs; also decimals like 3.14)
        r'\n\s*\n',  # a blank line (two newlines separated only by whitespace)
        r'unanswerable',  # the literal word 'unanswerable'
        r'Q\d+: ',  # an embedded question marker such as 'Q2: '
        r'A\d+: ',  # an embedded answer marker such as 'A2: '
    )
)


def is_valid_question_or_answer(text):
    """Return True if *text* is usable as a question or an answer.

    A text is rejected (False) when it is:
      * not a string (e.g. None, or the float NaN that pandas uses for
        missing cells) -- previously a truthy non-string raised
        AttributeError on ``.strip()``;
      * empty or whitespace-only;
      * matched anywhere by one of ``_REJECT_PATTERNS`` (bracketed spans,
        dotted tokens, blank lines, the word 'unanswerable', or embedded
        'Q<n>: ' / 'A<n>: ' markers), compared case-insensitively.

    Args:
        text: The candidate question or answer.

    Returns:
        bool: True when none of the rejection rules apply, otherwise False.
    """
    if not isinstance(text, str) or not text.strip():
        return False

    return not any(pattern.search(text) for pattern in _REJECT_PATTERNS)

# Function to process a single group
def process_group(key_group):
    try:
        key, group = key_group
        qa_pairs = []
        for _, row in group.iterrows():
            question = re.sub(r'^Q\d+: ', '', row['question'])
            answer = re.sub(r'^A\d+: ', '', row['answer'])
            if is_valid_question_or_answer(question) and is_valid_question_or_answer(answer):
                qa_pairs.append({
                    "user": question,
                    "assistant": answer,
                    "source": "PDFA key: " + str(row['__key__'])
                })
        if qa_pairs:
            return {
                "texts": qa_pairs,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



