{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"**Blog: Plagiarism detection with Elasticsearch**"
],
"metadata": {
"id": "kmMkWI9MH7SG"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Q9cqVF6lJtYw"
},
"outputs": [],
"source": [
"!pip install elasticsearch==8.11 #Elasticsearch"
]
},
{
"cell_type": "code",
"source": [
"pip -q install eland elasticsearch sentence_transformers transformers torch==2.1.0"
],
"metadata": {
"id": "wwi3NpszKa_U"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from elasticsearch import Elasticsearch, helpers\n",
"from elasticsearch.client import MlClient\n",
"from eland.ml.pytorch import PyTorchModel\n",
"from eland.ml.pytorch.transformers import TransformerModel\n",
"from urllib.request import urlopen\n",
"import json\n",
"from pathlib import Path\n",
"import getpass"
],
"metadata": {
"id": "8JSAt-uUKcix"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Found in the 'Manage Deployment' page\n",
"CLOUD_ID = getpass.getpass(\"Enter Elastic Cloud ID: \")\n",
"\n",
"# Password for the 'elastic' user generated by Elasticsearch\n",
"ELASTIC_PASSWORD = getpass.getpass(\"Enter Elastic password: \")\n",
"\n",
"# Create the client instance\n",
"client = Elasticsearch(\n",
" cloud_id=CLOUD_ID, basic_auth=(\"elastic\", ELASTIC_PASSWORD), request_timeout=3600\n",
")"
],
"metadata": {
"id": "ctmF7sNwKd5o"
},
"execution_count": null,
"outputs": []
},
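{
"cell_type": "markdown",
"source": [
"**Optional connectivity check.** A quick sanity check (not part of the original blog flow): confirm that the client can reach the deployment before loading any models. `client.info()` simply returns cluster metadata."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional: verify the connection before importing the models\n",
"info = client.info()\n",
"print(f\"Connected to cluster '{info['cluster_name']}' (version {info['version']['number']})\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},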
{
"cell_type": "code",
"source": [
"# Set the model name from Hugging Face and task type\n",
"# open ai detector model - developed by open ai https://github.com/openai/gpt-2-output-dataset/tree/master/detector\n",
"hf_model_id = \"roberta-base-openai-detector\"\n",
"tm = TransformerModel(model_id=hf_model_id, task_type=\"text_classification\")\n",
"\n",
"# set the modelID as it is named in Elasticsearch\n",
"es_model_id = tm.elasticsearch_model_id()\n",
"\n",
"# Download the model from Hugging Face\n",
"tmp_path = \"models\"\n",
"Path(tmp_path).mkdir(parents=True, exist_ok=True)\n",
"model_path, config, vocab_path = tm.save(tmp_path)\n",
"\n",
"# Load the model into Elasticsearch\n",
"ptm = PyTorchModel(client, es_model_id)\n",
"ptm.import_model(\n",
" model_path=model_path, config_path=None, vocab_path=vocab_path, config=config\n",
")\n",
"\n",
"# Start the model\n",
"s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n",
"s.body"
],
"metadata": {
"id": "AXeDnvJWKfll"
},
"execution_count": null,
"outputs": []
},
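{
"cell_type": "markdown",
"source": [
"**Optional detector test.** A small sketch (not in the original blog): run one inference against the freshly started detector to confirm the deployment responds. It reuses the `MlClient`-with-client pattern from the cell above; the detector returns a `Real`/`Fake` label for the supplied `text_field`."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional: quick test inference against the detector model\n",
"test_docs = [{\"text_field\": \"Elasticsearch is a distributed search and analytics engine.\"}]\n",
"test_response = MlClient.infer_trained_model(client, model_id=es_model_id, docs=test_docs)\n",
"print(test_response[\"inference_results\"][0][\"predicted_value\"])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},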
{
"cell_type": "code",
"source": [
"# Set the model name from Hugging Face and task type\n",
"# sentence-transformers model\n",
"hf_model_id = \"sentence-transformers/all-mpnet-base-v2\"\n",
"tm = TransformerModel(model_id=hf_model_id, task_type=\"text_embedding\")\n",
"\n",
"# set the modelID as it is named in Elasticsearch\n",
"es_model_id = tm.elasticsearch_model_id()\n",
"\n",
"# Download the model from Hugging Face\n",
"tmp_path = \"models\"\n",
"Path(tmp_path).mkdir(parents=True, exist_ok=True)\n",
"model_path, config, vocab_path = tm.save(tmp_path)\n",
"\n",
"# Load the model into Elasticsearch\n",
"ptm = PyTorchModel(client, es_model_id)\n",
"ptm.import_model(\n",
" model_path=model_path, config_path=None, vocab_path=vocab_path, config=config\n",
")\n",
"\n",
"# Start the model\n",
"s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n",
"s.body"
],
"metadata": {
"id": "wFiJAVpBKjkP"
},
"execution_count": null,
"outputs": []
},
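{
"cell_type": "markdown",
"source": [
"**Optional deployment check.** Before building the ingest pipeline, you can confirm that both imported models report a started deployment. This is a small sketch using `MlClient.get_trained_models_stats`; the comma-separated `model_id` string is assumed to match the IDs produced by `elasticsearch_model_id()` above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional: confirm both model deployments are started\n",
"stats = MlClient.get_trained_models_stats(\n",
"    client,\n",
"    model_id=\"roberta-base-openai-detector,sentence-transformers__all-mpnet-base-v2\",\n",
")\n",
"for model_stats in stats[\"trained_model_stats\"]:\n",
"    state = model_stats.get(\"deployment_stats\", {}).get(\"state\")\n",
"    print(model_stats[\"model_id\"], \"->\", state)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},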
{
"cell_type": "code",
"source": [
"# source index\n",
"client.indices.create(\n",
" index=\"plagiarism-docs\",\n",
" mappings={\n",
" \"properties\": {\n",
" \"title\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n",
" \"abstract\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n",
" \"url\": {\"type\": \"keyword\"},\n",
" \"venue\": {\"type\": \"keyword\"},\n",
" \"year\": {\"type\": \"keyword\"},\n",
" }\n",
" },\n",
")"
],
"metadata": {
"id": "S-SNKitkKmHC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# ingest pipeline\n",
"\n",
"client.ingest.put_pipeline(\n",
" id=\"plagiarism-checker-pipeline\",\n",
" processors=[\n",
" {\n",
" \"inference\": { # for ml models - to infer against the data that is being ingested in the pipeline\n",
" \"model_id\": \"roberta-base-openai-detector\", # text classification model id\n",
" \"target_field\": \"openai-detector\", # Target field for the inference results\n",
" \"field_map\": { # Maps the document field names to the known field names of the model.\n",
" \"abstract\": \"text_field\" # Field matching our configured trained model input. Typically for NLP models, the field name is text_field.\n",
" },\n",
" }\n",
" },\n",
" {\n",
" \"inference\": {\n",
" \"model_id\": \"sentence-transformers__all-mpnet-base-v2\", # text embedding model model id\n",
" \"target_field\": \"abstract_vector\", # Target field for the inference results\n",
" \"field_map\": { # Maps the document field names to the known field names of the model.\n",
" \"abstract\": \"text_field\" # Field matching our configured trained model input. Typically for NLP models, the field name is text_field.\n",
" },\n",
" }\n",
" },\n",
" ],\n",
")"
],
"metadata": {
"id": "XdxP1bJ2KocF"
},
"execution_count": null,
"outputs": []
},
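{
"cell_type": "markdown",
"source": [
"**Optional pipeline simulation.** Before reindexing the whole dataset, you can simulate the pipeline on a single made-up document. This sketch assumes the two inference processors write their results under the `openai-detector` and `abstract_vector` target fields configured above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional: simulate the ingest pipeline on one sample document\n",
"simulated = client.ingest.simulate(\n",
"    id=\"plagiarism-checker-pipeline\",\n",
"    docs=[{\"_source\": {\"abstract\": \"We present a new dataset for machine reading comprehension.\"}}],\n",
")\n",
"enriched = simulated[\"docs\"][0][\"doc\"][\"_source\"]\n",
"print(\"Detector label:\", enriched[\"openai-detector\"][\"predicted_value\"])\n",
"print(\"Embedding dimensions:\", len(enriched[\"abstract_vector\"][\"predicted_value\"]))  # expected 768"
],
"metadata": {},
"execution_count": null,
"outputs": []
},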
{
"cell_type": "code",
"source": [
"client.indices.create(\n",
" index=\"plagiarism-checker\",\n",
" mappings={\n",
" \"properties\": {\n",
" \"title\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n",
" \"abstract\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n",
" \"url\": {\"type\": \"keyword\"},\n",
" \"venue\": {\"type\": \"keyword\"},\n",
" \"year\": {\"type\": \"keyword\"},\n",
" \"abstract_vector.predicted_value\": { # Inference results field, target_field.predicted_value\n",
" \"type\": \"dense_vector\",\n",
" \"dims\": 768, # embedding_size\n",
" \"index\": \"true\",\n",
" \"similarity\": \"dot_product\", # When indexing vectors for approximate kNN search, you need to specify the similarity function for comparing the vectors.\n",
" },\n",
" }\n",
" },\n",
")"
],
"metadata": {
"id": "cN4KjsXKKyTu"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"url = \"https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/emnlp2016-2018.json\"\n",
"\n",
"# Send a request to the URL and get the response\n",
"response = urlopen(url)\n",
"\n",
"# Load the response data into a JSON object\n",
"data_json = json.loads(response.read())\n",
"\n",
"\n",
"def create_index_body(doc):\n",
" \"\"\"Generate the body for an Elasticsearch document.\"\"\"\n",
" return {\n",
" \"_index\": \"plagiarism-docs\",\n",
" \"_source\": doc,\n",
" }\n",
"\n",
"\n",
"# Prepare the documents to be indexed\n",
"documents = [create_index_body(doc) for doc in data_json]\n",
"\n",
"# Use helpers.bulk to index\n",
"helpers.bulk(client, documents)\n",
"\n",
"print(\"Done indexing documents into `plagiarism-docs` source index\")"
],
"metadata": {
"id": "svjGh_hUK136"
},
"execution_count": null,
"outputs": []
},
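{
"cell_type": "markdown",
"source": [
"**Optional count check.** Refresh the source index and confirm the number of indexed documents matches the downloaded dataset before reindexing."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional: verify the source index document count\n",
"client.indices.refresh(index=\"plagiarism-docs\")\n",
"print(client.count(index=\"plagiarism-docs\")[\"count\"], \"documents in plagiarism-docs\")\n",
"print(len(data_json), \"documents in the downloaded dataset\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},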
{
"cell_type": "code",
"source": [
"# reindex with ingest pipeline\n",
"\n",
"client.reindex(\n",
" wait_for_completion=True,\n",
" source={\"index\": \"plagiarism-docs\"},\n",
" dest={\"index\": \"plagiarism-checker\", \"pipeline\": \"plagiarism-checker-pipeline\"},\n",
")"
],
"metadata": {
"id": "_lHg7p6SK5Ws"
},
"execution_count": null,
"outputs": []
},
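{
"cell_type": "markdown",
"source": [
"**Optional enrichment check.** Confirm the destination index was populated and inspect one document to see the fields added by the pipeline. This sketch assumes the inference results live under `openai-detector.predicted_value` and `abstract_vector.predicted_value`, as configured above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional: inspect one enriched document in the destination index\n",
"client.indices.refresh(index=\"plagiarism-checker\")\n",
"print(client.count(index=\"plagiarism-checker\")[\"count\"], \"documents in plagiarism-checker\")\n",
"\n",
"sample = client.search(index=\"plagiarism-checker\", size=1)[\"hits\"][\"hits\"][0][\"_source\"]\n",
"print(\"Detector label:\", sample[\"openai-detector\"][\"predicted_value\"])\n",
"print(\"Embedding dimensions:\", len(sample[\"abstract_vector\"][\"predicted_value\"]))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},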
{
"cell_type": "code",
"source": [
"# duplicated text - direct plagiarism\n",
"\n",
"model_text = \"Understanding and reasoning about cooking recipes is a fruitful research direction towards enabling machines to interpret procedural text. In this work, we introduce RecipeQA, a dataset for multimodal comprehension of cooking recipes. It comprises of approximately 20K instructional recipes with multiple modalities such as titles, descriptions and aligned set of images. With over 36K automatically generated question-answer pairs, we design a set of comprehension and reasoning tasks that require joint understanding of images and text, capturing the temporal flow of events and making sense of procedural knowledge. Our preliminary results indicate that RecipeQA will serve as a challenging test bed and an ideal benchmark for evaluating machine comprehension systems. The data and leaderboard are available at http://hucvl.github.io/recipeqa.\"\n",
"\n",
"response = client.search(\n",
" index=\"plagiarism-checker\",\n",
" size=1,\n",
" knn={\n",
" \"field\": \"abstract_vector.predicted_value\",\n",
" \"k\": 9,\n",
" \"num_candidates\": 974,\n",
" \"query_vector_builder\": {\n",
" \"text_embedding\": {\n",
" \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n",
" \"model_text\": model_text,\n",
" }\n",
" },\n",
" },\n",
")\n",
"\n",
"for hit in response[\"hits\"][\"hits\"]:\n",
" score = hit[\"_score\"]\n",
" title = hit[\"_source\"][\"title\"]\n",
" abstract = hit[\"_source\"][\"abstract\"]\n",
" openai = hit[\"_source\"][\"openai-detector\"][\"predicted_value\"]\n",
" url = hit[\"_source\"][\"url\"]\n",
"\n",
" if score > 0.9:\n",
" print(f\"\\nHigh similarity detected! This might be plagiarism.\")\n",
" print(\n",
" f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\"\n",
" )\n",
"\n",
" if openai == \"Fake\":\n",
" print(\"This document may have been created by AI.\\n\")\n",
"\n",
" elif score < 0.7:\n",
" print(f\"\\nLow similarity detected. This might not be plagiarism.\")\n",
"\n",
" if openai == \"Fake\":\n",
" print(\"This document may have been created by AI.\\n\")\n",
"\n",
" else:\n",
" print(f\"\\nModerate similarity detected.\")\n",
" print(\n",
" f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\"\n",
" )\n",
"\n",
" if openai == \"Fake\":\n",
" print(\"This document may have been created by AI.\\n\")\n",
"\n",
"ml_client = MlClient(client)\n",
"\n",
"model_id = \"roberta-base-openai-detector\" # open ai text classification model\n",
"\n",
"document = [{\"text_field\": model_text}]\n",
"\n",
"ml_response = ml_client.infer_trained_model(model_id=model_id, docs=document)\n",
"\n",
"predicted_value = ml_response[\"inference_results\"][0][\"predicted_value\"]\n",
"\n",
"if predicted_value == \"Fake\":\n",
" print(\"Note: The text query you entered may have been generated by AI.\\n\")"
],
"metadata": {
"id": "51Tjohr8K-tW"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# similar text - paraphrase plagiarism\n",
"\n",
"model_text = \"Comprehending and deducing information from culinary instructions represents a promising avenue for research aimed at empowering artificial intelligence to decipher step-by-step text. In this study, we present CuisineInquiry, a database for the multifaceted understanding of cooking guidelines. It encompasses a substantial number of informative recipes featuring various elements such as headings, explanations, and a matched assortment of visuals. Utilizing an extensive set of automatically crafted question-answer pairings, we formulate a series of tasks focusing on understanding and logic that necessitate a combined interpretation of visuals and written content. This involves capturing the sequential progression of events and extracting meaning from procedural expertise. Our initial findings suggest that CuisineInquiry is poised to function as a demanding experimental platform.\"\n",
"\n",
"response = client.search(\n",
" index=\"plagiarism-checker\",\n",
" size=1,\n",
" knn={\n",
" \"field\": \"abstract_vector.predicted_value\",\n",
" \"k\": 9,\n",
" \"num_candidates\": 974,\n",
" \"query_vector_builder\": {\n",
" \"text_embedding\": {\n",
" \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n",
" \"model_text\": model_text,\n",
" }\n",
" },\n",
" },\n",
")\n",
"\n",
"for hit in response[\"hits\"][\"hits\"]:\n",
" score = hit[\"_score\"]\n",
" title = hit[\"_source\"][\"title\"]\n",
" abstract = hit[\"_source\"][\"abstract\"]\n",
" openai = hit[\"_source\"][\"openai-detector\"][\"predicted_value\"]\n",
" url = hit[\"_source\"][\"url\"]\n",
"\n",
" if score > 0.9:\n",
" print(f\"\\nHigh similarity detected! This might be plagiarism.\")\n",
" print(\n",
" f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\"\n",
" )\n",
"\n",
" if openai == \"Fake\":\n",
" print(\"This document may have been created by AI.\\n\")\n",
"\n",
" elif score < 0.7:\n",
" print(f\"\\nLow similarity detected. This might not be plagiarism.\")\n",
"\n",
" if openai == \"Fake\":\n",
" print(\"This document may have been created by AI.\\n\")\n",
"\n",
" else:\n",
" print(f\"\\nModerate similarity detected.\")\n",
" print(\n",
" f\"\\nMost similar document: '{title}'\\n\\nAbstract: {abstract}\\n\\nurl: {url}\\n\\nScore:{score}\\n\"\n",
" )\n",
"\n",
" if openai == \"Fake\":\n",
" print(\"This document may have been created by AI.\\n\")\n",
"\n",
"ml_client = MlClient(client)\n",
"\n",
"model_id = \"roberta-base-openai-detector\" # open ai text classification model\n",
"\n",
"document = [{\"text_field\": model_text}]\n",
"\n",
"ml_response = ml_client.infer_trained_model(model_id=model_id, docs=document)\n",
"\n",
"predicted_value = ml_response[\"inference_results\"][0][\"predicted_value\"]\n",
"\n",
"if predicted_value == \"Fake\":\n",
" print(\"Note: The text query you entered may have been generated by AI.\\n\")"
],
"metadata": {
"id": "XcYCPXM0LAT3"
},
"execution_count": null,
"outputs": []
}
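,
{
"cell_type": "markdown",
"source": [
"**Optional cleanup.** Once you are done experimenting, you can stop both model deployments and delete the demo pipeline and indices. This is a hedged sketch of the teardown; skip it if you want to keep the deployment running."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional cleanup: stop the deployments and remove the demo resources\n",
"for deployed_model_id in [\n",
"    \"roberta-base-openai-detector\",\n",
"    \"sentence-transformers__all-mpnet-base-v2\",\n",
"]:\n",
"    MlClient.stop_trained_model_deployment(client, model_id=deployed_model_id)\n",
"\n",
"client.ingest.delete_pipeline(id=\"plagiarism-checker-pipeline\")\n",
"client.indices.delete(index=\"plagiarism-docs\")\n",
"client.indices.delete(index=\"plagiarism-checker\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
}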
]
}