from functools import lru_cache
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch


class AutoComplete:
    def __init__(self, model_name="gpt2"):
        """Initialize the auto-complete system."""
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_side="left")
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()  # Set to evaluation mode

    def get_completion(self, text, max_length=50):
        """Generate a completion for the input text."""
        print("**** Generating completion for:", text)
        # Encode the input text
        inputs = self.tokenizer(text, add_special_tokens=False, return_tensors="pt")
        input_ids = inputs["input_ids"].to(self.device)
        attn_masks = inputs["attention_mask"].to(self.device)

        # Generate completion
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids,
                attention_mask=attn_masks,
                max_length=max_length,
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.7
            )

        # Decode and extract completion
        full_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        completion = full_text[len(text):]

        return completion


class CachedAutoComplete(AutoComplete):
    def __init__(self, cache_size=1000, **kwargs):
        """Initialize with caching support."""
        super().__init__(**kwargs)
        self.get_completion = lru_cache(maxsize=cache_size)(
            self.get_completion
        )
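        # Note: wrapping the bound method gives each instance its own cache,
        # keyed on the call arguments (text, max_length); repeated prompts
        # return the cached completion instead of re-running generation.
        # lru_cache requires hashable arguments, which strings and ints are.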


class OptimizedAutoComplete(CachedAutoComplete):
    def __init__(self, **kwargs):
        """Initialize with optimizations."""
        super().__init__(**kwargs)
        # GPT-2 has no pad token by default; reuse the EOS token for padding
        self.tokenizer.pad_token = self.tokenizer.eos_token

        if self.device == "cuda":
            self.model = self.model.half()  # Use FP16 on GPU

        # Ensure evaluation mode (already set in the base class)
        self.model.eval()

    def preprocess_batch(self, texts):
        """Efficiently process multiple texts."""
        # Tokenize all texts at once; left padding (set in the base class)
        # keeps prompts right-aligned so generate() appends after the real text
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        return inputs.to(self.device)

    def generate_batch(self, texts, max_length=50):
        """Generate completions for multiple texts."""
        # Preprocess batch
        inputs = self.preprocess_batch(texts)

        # Generate completions
        with torch.no_grad():
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.7
            )

        # Decode completions (skip_special_tokens drops the padding tokens)
        completions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Extract new text by stripping the original prompt prefix
        results = []
        for text, completion in zip(texts, completions):
            results.append(completion[len(text):])

        return results


# Example: Optimized batch completion
optimized_complete = OptimizedAutoComplete()
texts = [
    "Machine learning is",
    "Deep neural networks can",
    "The training process involves"
]
completions = optimized_complete.generate_batch(texts)
for text, completion in zip(texts, completions):
    print(f"\nInput: {text}")
    print(f"Completion: {completion}")
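

# Example: cached single-prompt completion (a minimal usage sketch; with
# do_sample=True the first result varies, but repeating the same prompt
# returns the cached completion instead of re-running generation)
cached_complete = CachedAutoComplete(cache_size=100)
prompt = "Machine learning is"
first = cached_complete.get_completion(prompt)   # cache miss: runs the model
second = cached_complete.get_completion(prompt)  # cache hit: same string returned
print(f"\nInput: {prompt}")
print(f"Completion: {first}")
print("Identical results:", first == second)
print(cached_complete.get_completion.cache_info())  # e.g. CacheInfo(hits=1, misses=1, ...)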