**Blog: Plagiarism detection with Elasticsearch**

In [None]:
!pip install elasticsearch==8.11 #Elasticsearch

In [None]:
pip -q install eland elasticsearch sentence_transformers transformers torch==2.1.0

In [None]:
from elasticsearch import Elasticsearch, helpers
from elasticsearch.client import MlClient
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel
from urllib.request import urlopen
import json
from pathlib import Path
import getpass

In [None]:
# Elastic Cloud ID - found in the 'Manage Deployment' page.
# Prompted for (not hardcoded) so it never ends up in the saved notebook.
CLOUD_ID = getpass.getpass("Enter Elastic Cloud ID:  ")

# Password for the 'elastic' user generated by Elasticsearch
ELASTIC_PASSWORD = getpass.getpass("Enter Elastic password:  ")

# Create the client instance, authenticating as the 'elastic' user.
# request_timeout is set high (3600) because later cells issue long-running
# requests, e.g. a reindex that waits for completion.
elastic_credentials = ("elastic", ELASTIC_PASSWORD)
client = Elasticsearch(
    cloud_id=CLOUD_ID,
    basic_auth=elastic_credentials,
    request_timeout=3600,
)

In [None]:
# Set the model name from Hugging Face and task type
# OpenAI detector model - developed by OpenAI
# https://github.com/openai/gpt-2-output-dataset/tree/master/detector
hf_model_id = "roberta-base-openai-detector"
tm = TransformerModel(model_id=hf_model_id, task_type="text_classification")

# Model ID as it is named in Elasticsearch (derived by eland from the Hugging Face ID)
es_model_id = tm.elasticsearch_model_id()

# Download the model from Hugging Face and save it, with its config and
# vocabulary, into a local working directory
tmp_path = "models"
Path(tmp_path).mkdir(parents=True, exist_ok=True)
model_path, config, vocab_path = tm.save(tmp_path)

# Load the model into Elasticsearch
ptm = PyTorchModel(client, es_model_id)
ptm.import_model(
    model_path=model_path, config_path=None, vocab_path=vocab_path, config=config
)

# Start the model.
# Go through the client's `ml` namespace instead of calling the MlClient method
# unbound with the top-level client passed in as `self`.
s = client.ml.start_trained_model_deployment(model_id=es_model_id)
s.body

In [None]:
# Set the model name from Hugging Face and task type
# sentence-transformers model used to embed abstracts for similarity search
hf_model_id = "sentence-transformers/all-mpnet-base-v2"
tm = TransformerModel(model_id=hf_model_id, task_type="text_embedding")

# Model ID as it is named in Elasticsearch (derived by eland from the Hugging Face ID)
es_model_id = tm.elasticsearch_model_id()

# Download the model from Hugging Face and save it, with its config and
# vocabulary, into a local working directory
tmp_path = "models"
Path(tmp_path).mkdir(parents=True, exist_ok=True)
model_path, config, vocab_path = tm.save(tmp_path)

# Load the model into Elasticsearch
ptm = PyTorchModel(client, es_model_id)
ptm.import_model(
    model_path=model_path, config_path=None, vocab_path=vocab_path, config=config
)

# Start the model.
# Go through the client's `ml` namespace instead of calling the MlClient method
# unbound with the top-level client passed in as `self`.
s = client.ml.start_trained_model_deployment(model_id=es_model_id)
s.body

In [None]:
# source index: holds the raw documents before any inference is applied
source_index_mappings = {
    "properties": {
        # Full-text fields, each with an exact-match `keyword` sub-field
        "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "abstract": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        # Exact-value fields
        "url": {"type": "keyword"},
        "venue": {"type": "keyword"},
        "year": {"type": "keyword"},
    }
}

client.indices.create(index="plagiarism-docs", mappings=source_index_mappings)

In [None]:
# ingest pipeline: two inference processors run on every document that passes
# through it. Each maps the document's `abstract` field onto `text_field`, the
# input field name the trained models are configured with (typical for NLP models).

# 1) Text classification model; results are written under `openai-detector`
classification_processor = {
    "inference": {
        "model_id": "roberta-base-openai-detector",
        "target_field": "openai-detector",
        "field_map": {"abstract": "text_field"},
    }
}

# 2) Text embedding model; results are written under `abstract_vector`
embedding_processor = {
    "inference": {
        "model_id": "sentence-transformers__all-mpnet-base-v2",
        "target_field": "abstract_vector",
        "field_map": {"abstract": "text_field"},
    }
}

client.ingest.put_pipeline(
    id="plagiarism-checker-pipeline",
    processors=[classification_processor, embedding_processor],
)

In [None]:
# Destination index: same document fields as the source index plus the
# embedding produced by the ingest pipeline.
client.indices.create(
    index="plagiarism-checker",
    mappings={
        "properties": {
            "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
            "abstract": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
            "url": {"type": "keyword"},
            "venue": {"type": "keyword"},
            "year": {"type": "keyword"},
            # Inference results field: <target_field>.predicted_value
            "abstract_vector.predicted_value": {
                "type": "dense_vector",
                "dims": 768,  # embedding size of the text embedding model
                # Boolean mapping parameter (was the string "true"); enables
                # indexing of the vectors for approximate kNN search.
                "index": True,
                # Similarity function used to compare vectors in kNN search.
                # NOTE(review): dot_product expects unit-length vectors - assumes
                # the embedding model emits normalized vectors; confirm.
                "similarity": "dot_product",
            },
        }
    },
)

In [None]:
url = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/emnlp2016-2018.json"

# Send a request to the URL and parse the JSON payload.
# The context manager closes the HTTP response once it has been read (or if
# reading/parsing raises), instead of leaving the connection open.
with urlopen(url) as response:
    data_json = json.loads(response.read())


def create_index_body(doc):
    """Wrap ``doc`` as a bulk action targeting the ``plagiarism-docs`` index.

    The returned dict has two keys: ``_index`` (always ``"plagiarism-docs"``)
    and ``_source`` (the ``doc`` object itself, not a copy).
    """
    return dict(_index="plagiarism-docs", _source=doc)


# Turn every downloaded record into a bulk action for the source index
documents = list(map(create_index_body, data_json))

# Index all actions through the bulk helper
helpers.bulk(client, documents)

print("Done indexing documents into `plagiarism-docs` source index")

In [None]:
# reindex with ingest pipeline: copy every document from the source index into
# the checker index, running it through the inference pipeline on the way.
reindex_source = {"index": "plagiarism-docs"}
reindex_dest = {"index": "plagiarism-checker", "pipeline": "plagiarism-checker-pipeline"}

# Block until the whole reindex has finished
client.reindex(source=reindex_source, dest=reindex_dest, wait_for_completion=True)

In [None]:
# duplicated text - direct plagiarism

model_text = "Understanding and reasoning about cooking recipes is a fruitful research direction towards enabling machines to interpret procedural text. In this work, we introduce RecipeQA, a dataset for multimodal comprehension of cooking recipes. It comprises of approximately 20K instructional recipes with multiple modalities such as titles, descriptions and aligned set of images. With over 36K automatically generated question-answer pairs, we design a set of comprehension and reasoning tasks that require joint understanding of images and text, capturing the temporal flow of events and making sense of procedural knowledge. Our preliminary results indicate that RecipeQA will serve as a challenging test bed and an ideal benchmark for evaluating machine comprehension systems. The data and leaderboard are available at http://hucvl.github.io/recipeqa."

# kNN search over the stored abstract embeddings. The query vector is built
# from `model_text` by the text embedding model named below; 9 neighbours are
# requested but only the single best hit is returned (size=1).
knn_query = {
    "field": "abstract_vector.predicted_value",
    "k": 9,
    "num_candidates": 974,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-mpnet-base-v2",
            "model_text": model_text,
        }
    },
}
response = client.search(index="plagiarism-checker", size=1, knn=knn_query)

for hit in response["hits"]["hits"]:
    score = hit["_score"]
    source = hit["_source"]
    title = source["title"]
    abstract = source["abstract"]
    openai = source["openai-detector"]["predicted_value"]
    url = source["url"]

    # Classify the hit by its score: above 0.9 is high, below 0.7 is low,
    # anything in between is moderate. Low-similarity hits are not printed.
    if score > 0.9:
        headline = "\nHigh similarity detected! This might be plagiarism."
        show_match = True
    elif score < 0.7:
        headline = "\nLow similarity detected. This might not be plagiarism."
        show_match = False
    else:
        headline = "\nModerate similarity detected."
        show_match = True

    print(headline)
    if show_match:
        print(
            f"\nMost similar document: '{title}'\n\nAbstract: {abstract}\n\nurl: {url}\n\nScore:{score}\n"
        )

    # Classifier label stored on the matched document at ingest time
    if openai == "Fake":
        print("This document may have been created by AI.\n")

# Run the query text itself through the text classification model
ml_client = MlClient(client)

model_id = "roberta-base-openai-detector"  # open ai text classification model
document = [{"text_field": model_text}]

ml_response = ml_client.infer_trained_model(docs=document, model_id=model_id)

# One document was sent, so read the first (only) inference result
first_result = ml_response["inference_results"][0]
predicted_value = first_result["predicted_value"]

if predicted_value == "Fake":
    print("Note: The text query you entered may have been generated by AI.\n")

In [None]:
# similar text - paraphrase plagiarism

model_text = "Comprehending and deducing information from culinary instructions represents a promising avenue for research aimed at empowering artificial intelligence to decipher step-by-step text. In this study, we present CuisineInquiry, a database for the multifaceted understanding of cooking guidelines. It encompasses a substantial number of informative recipes featuring various elements such as headings, explanations, and a matched assortment of visuals. Utilizing an extensive set of automatically crafted question-answer pairings, we formulate a series of tasks focusing on understanding and logic that necessitate a combined interpretation of visuals and written content. This involves capturing the sequential progression of events and extracting meaning from procedural expertise. Our initial findings suggest that CuisineInquiry is poised to function as a demanding experimental platform."

# kNN search over the stored abstract embeddings. The query vector is built
# from `model_text` by the text embedding model named below; 9 neighbours are
# requested but only the single best hit is returned (size=1).
knn_query = {
    "field": "abstract_vector.predicted_value",
    "k": 9,
    "num_candidates": 974,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-mpnet-base-v2",
            "model_text": model_text,
        }
    },
}
response = client.search(index="plagiarism-checker", size=1, knn=knn_query)

for hit in response["hits"]["hits"]:
    score = hit["_score"]
    source = hit["_source"]
    title = source["title"]
    abstract = source["abstract"]
    openai = source["openai-detector"]["predicted_value"]
    url = source["url"]

    # Classify the hit by its score: above 0.9 is high, below 0.7 is low,
    # anything in between is moderate. Low-similarity hits are not printed.
    if score > 0.9:
        headline = "\nHigh similarity detected! This might be plagiarism."
        show_match = True
    elif score < 0.7:
        headline = "\nLow similarity detected. This might not be plagiarism."
        show_match = False
    else:
        headline = "\nModerate similarity detected."
        show_match = True

    print(headline)
    if show_match:
        print(
            f"\nMost similar document: '{title}'\n\nAbstract: {abstract}\n\nurl: {url}\n\nScore:{score}\n"
        )

    # Classifier label stored on the matched document at ingest time
    if openai == "Fake":
        print("This document may have been created by AI.\n")

# Run the query text itself through the text classification model
ml_client = MlClient(client)

model_id = "roberta-base-openai-detector"  # open ai text classification model
document = [{"text_field": model_text}]

ml_response = ml_client.infer_trained_model(docs=document, model_id=model_id)

# One document was sent, so read the first (only) inference result
first_result = ml_response["inference_results"][0]
predicted_value = first_result["predicted_value"]

if predicted_value == "Fake":
    print("Note: The text query you entered may have been generated by AI.\n")