# generate_response_slm — excerpt from agora/cerebral_api/src/app.py

    def generate_response_slm(self, question: str, industry: str = None, role: str = None, sid: str = None) -> str:
        """Generate a response with the local model, streaming tokens over Socket.IO.

        Retrieves document context for *question*, formats the prompt, and
        generates text token by token. When *sid* is given, tokens and
        periodic keep-alive status events are emitted to that client so the
        socket connection is not dropped during long generations.

        Args:
            question: The user question to answer.
            industry: Optional industry hint (accepted for interface
                compatibility; not used in this method).
            role: Optional role hint (accepted for interface compatibility;
                not used in this method).
            sid: Socket.IO session id to stream events to; when None the
                method runs silently and only returns the text.

        Returns:
            The generated response text, or an error-message string if
            generation failed (this method never raises to the caller).
        """
        try:
            if VERBOSE:
                logger.debug(f"Generating response for question: {question}")

            # Send initial status to keep connection alive
            if sid:
                socketio.emit('status', {'message': 'Searching relevant documents...'}, room=sid)

            # Retrieve context; on failure, degrade to a general response
            # instead of aborting the whole request.
            try:
                context = self._get_context(question)

                # Send context immediately when found
                if sid:
                    socketio.emit('context', {'context': context}, room=sid)
                    socketio.emit('status', {'message': 'Generating response...'}, room=sid)
            except Exception as e:
                logger.error(f"Error retrieving context: {str(e)}")
                context = "Error retrieving context. Proceeding with general response."
                if sid:
                    socketio.emit('error', {'error': 'Context retrieval error, proceeding with general response'}, room=sid)

            # Format prompt
            prompt = self.prompt_template.format(
                context=context,
                question=question
            )

            # Keep connection alive during token generation
            last_update = time.time()
            update_interval = 2.0  # Send status update every 2 seconds

            # Generate response
            input_tokens = self.tokenizer.encode(prompt)
            params = og.GeneratorParams(self.model)
            params.set_search_options(**self.search_options)
            params.input_ids = input_tokens
            generator = og.Generator(self.model, params)

            try:
                generated_text = ""

                while not generator.is_done():
                    # BUGFIX: sample the clock at the top of each iteration.
                    # Previously the timestamp was captured before the loop
                    # and only refreshed at the end of each pass, so the
                    # keep-alive check compared a stale time and the interval
                    # drifted by one iteration's worth of generation work.
                    current_time = time.time()

                    # Send keep-alive status periodically
                    if sid and (current_time - last_update) > update_interval:
                        socketio.emit('status', {'message': 'Still generating...'}, room=sid)
                        last_update = current_time

                    generator.compute_logits()
                    generator.generate_next_token()

                    new_token = generator.get_next_tokens()[0]
                    token_text = self.tokenizer_stream.decode(new_token)
                    generated_text += token_text

                    if sid:
                        socketio.emit('token', {'token': token_text}, room=sid)
                        # Small sleep to prevent overwhelming the socket
                        time.sleep(0.01)

                if sid:
                    socketio.emit('complete', room=sid)

                if VERBOSE:
                    logger.debug(f"Generated response length: {len(generated_text)}")
                    logger.debug(f"Response preview: {generated_text[:200]}...")

                return generated_text

            finally:
                # Release the native generator promptly; it can hold
                # significant model/KV-cache memory between requests.
                del generator

        except Exception as e:
            error_msg = f"Error generating response: {str(e)}"
            logger.error(error_msg)
            if sid:
                socketio.emit('error', {'error': error_msg}, room=sid)
            return error_msg