In [None]:
!pip install "ray==2.8.1"
!pip install "ray[serve]" 
!pip install requests 
!pip install transformers 
!pip install langchain
!pip install torch

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain import PromptTemplate



# Raw prompt text for the two stages: the first asks for a fact about a
# topic, the second asks for a French translation of that fact.
template1 = "Give me a fact about {topic}. "
template2 = "Translate to french: {fact}"

# Stage 1 prompt: filled in with the user-supplied topic.
prompt = PromptTemplate(template=template1, input_variables=["topic"])

# Stage 2 prompt: filled in with the fact produced by stage 1.
second_prompt = PromptTemplate(template=template2, input_variables=["fact"])

def create_chains(llm):
    """Build the fact chain and the translation chain around one LLM.

    Args:
        llm: The language-model object passed to both ``LLMChain`` instances.

    Returns:
        A ``(fact_chain, translate_chain)`` tuple. The first chain uses the
        module-level ``prompt``, the second uses ``second_prompt``.
    """
    return (
        LLMChain(llm=llm, prompt=prompt),
        LLMChain(llm=llm, prompt=second_prompt),
    )

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, pipeline

# Checkpoint used for the local (in-notebook) smoke test of the chains.
model_id = 'google/flan-t5-small'

# Load configuration, tokenizer and seq2seq weights for that checkpoint.
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, config=config)

# Wrap model + tokenizer in a text2text pipeline; outputs are capped at
# max_length=512.
_pipeline = pipeline(
    'text2text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)

# Expose the pipeline through LangChain and build both chains on top of it.
llm = HuggingFacePipeline(pipeline=_pipeline)
fact_chain, translate_chain = create_chains(llm)

In [None]:
# Ask the first chain for a fact about the topic, then feed that fact to the
# second chain to get its translation.
fact = fact_chain.run("birds")
translation = translate_chain.run(fact)

# Show the fact and its translation on separate lines.
print(fact, translation, sep="\n")

In [None]:
import ray

# Connect the notebook to the remote cluster at the ray:// address below and
# request a runtime environment with the pip packages the deployment imports.
ray.init(
    runtime_env={
        "pip": ["transformers>=4.26.0", "langchain", "requests", "torch"],
    },
    address="ray://ray-cluster-kuberay-head-svc:10001",
)

In [None]:
from ray import serve
from starlette.requests import Request
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, pipeline


@serve.deployment(ray_actor_options={"num_gpus": 1})
class DeployLLM:
    """Ray Serve deployment that returns a fact about a topic plus its French translation.

    The model is loaded once per replica in ``__init__``. HTTP requests supply
    the topic through the ``text`` query parameter and receive
    ``[fact, translation]`` back.
    """

    def __init__(self):
        """Load flan-t5-small, wrap it for LangChain, and build both chains."""
        # Local import: torch is only needed here to pick the device.
        import torch

        model_id = 'google/flan-t5-small'
        config = AutoConfig.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id, config=config)

        # The deployment reserves a GPU (num_gpus=1), but a pipeline built
        # without an explicit device leaves the model on CPU. Put it on the
        # first visible CUDA device when there is one, otherwise stay on CPU.
        device = 0 if torch.cuda.is_available() else -1
        _pipeline = pipeline(
            'text2text-generation',
            model=model,
            tokenizer=tokenizer,
            max_length=512,
            device=device,
        )

        llm = HuggingFacePipeline(pipeline=_pipeline)
        self.fact_chain, self.translate_chain = create_chains(llm)

    def _run_chain(self, text: str):
        """Run the fact chain on ``text`` and translate its output.

        Returns:
            A ``(fact, translation)`` tuple.
        """
        fact = self.fact_chain.run(text)
        translation = self.translate_chain.run(fact)
        return fact, translation

    async def __call__(self, request: Request):
        """Handle one HTTP request.

        Reads the topic from the ``text`` query parameter. Returns
        ``[fact, translation]`` on success, or a 400 JSON error when the
        parameter is missing or empty.
        """
        # Local import keeps this fix self-contained; starlette is already a
        # dependency of this cell (see the Request import).
        from starlette.responses import JSONResponse

        # 1. Parse the request; reject it explicitly instead of letting a
        #    KeyError surface as an opaque 500.
        text = request.query_params.get("text")
        if not text:
            return JSONResponse(
                {"error": "missing required query parameter 'text'"},
                status_code=400,
            )
        # 2. Run the chains. This call is synchronous, so the coroutine does
        #    not yield while the model is generating.
        fact, translation = self._run_chain(text)
        # 3. Return the response
        return [fact, translation]

In [None]:
# Bind the deployment class (no constructor arguments) to obtain the
# application object that is passed to `serve.run` in the next cell.
deployment = DeployLLM.bind()

In [None]:
# Deploy the bound application. host="0.0.0.0" is presumably there so the HTTP
# endpoint is reachable from outside the head node (the request cell targets
# the head service on port 8000).
# NOTE(review): confirm the installed Ray version still accepts `host` here.
serve.run(deployment, host="0.0.0.0")

In [None]:
import requests

query = "bunny"
# Pass the topic via `params` so requests percent-encodes it; interpolating it
# into the URL by hand breaks for topics containing spaces, '&', '#', etc.
response = requests.post(
    'http://ray-cluster-kuberay-head-svc:8000/',
    params={"text": query},
)
print(response.text)