scripts/process_json/process_json.py [27:56]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Accumulators for this pass over `data`. Nothing in this excerpt appends
    # to either list; presumably that happens further down the loop body (not
    # visible here) — TODO confirm.
    documents = []
    skipped_items = []
    # Walk every item in `data`. Each item is read through `.get(...)`, so it
    # is expected to be dict-like.
    for item in data:
        # Progress log keyed on the number of documents collected so far.
        # NOTE(review): the check runs at the top of every iteration, so it
        # logs "Processed 0 documents" before anything has been handled and
        # repeats the same line for each item while len(documents) stays on a
        # multiple of 20 (e.g. after the `continue` below) — confirm intended.
        if len(documents) % 20 == 0:
            logger.info(f"Processed {len(documents)} documents")

        # The handler(s) for this `try` lie beyond the end of this excerpt.
        try:
            # Pull id, text, source, source_id, url, created_at and author
            # from the item; any key that is absent comes back as None.
            # NOTE(review): the local name `id` shadows the builtin `id()`
            # for the rest of this scope.
            id = item.get("id", None)
            text = item.get("text", None)
            source = item.get("source", None)
            source_id = item.get("source_id", None)
            url = item.get("url", None)
            created_at = item.get("created_at", None)
            author = item.get("author", None)

            # Missing or empty text (None, "", or any other falsy value):
            # log it and move on to the next item. The item is not added to
            # skipped_items on this path.
            if not text:
                logger.info("No document text, skipping...")
                continue

            # Bundle the provenance fields into a DocumentMetadata (defined
            # outside this excerpt). Values are passed through exactly as
            # read above, so any of them may be None.
            metadata = DocumentMetadata(
                source=source,
                source_id=source_id,
                url=url,
                created_at=created_at,
                author=author,
            )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



scripts/process_jsonl/process_jsonl.py [27:56]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Result lists for this run over `data`. No line in this excerpt appends
    # to either of them; presumably they are filled later in the loop body,
    # past the end of what is shown — TODO confirm.
    documents = []
    skipped_items = []
    # Visit each item of `data` in order. Items are accessed with `.get(...)`,
    # so a dict-like object is expected.
    for item in data:
        # Emit a progress line whenever the collected-document count is a
        # multiple of 20.
        # NOTE(review): because the count is tested at the start of every
        # iteration, the first iteration logs "Processed 0 documents", and the
        # same line is logged again for every item that leaves
        # len(documents) unchanged (such as the `continue` path below) —
        # verify this is the desired behaviour.
        if len(documents) % 20 == 0:
            logger.info(f"Processed {len(documents)} documents")

        # The matching except/finally clause is outside this excerpt.
        try:
            # Read id, text, source, source_id, url, created_at and author
            # from the item; a missing key yields None.
            # NOTE(review): assigning to `id` hides the builtin `id()` within
            # this scope.
            id = item.get("id", None)
            text = item.get("text", None)
            source = item.get("source", None)
            source_id = item.get("source_id", None)
            url = item.get("url", None)
            created_at = item.get("created_at", None)
            author = item.get("author", None)

            # Items whose text is absent or falsy (None, empty string, ...)
            # are logged and skipped. Nothing is recorded in skipped_items
            # on this path.
            if not text:
                logger.info("No document text, skipping...")
                continue

            # Build the DocumentMetadata (declared elsewhere) from the
            # provenance fields. The values go in unmodified, so each may
            # still be None.
            metadata = DocumentMetadata(
                source=source,
                source_id=source_id,
                url=url,
                created_at=created_at,
                author=author,
            )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



