Applications with Context Vectors


import numpy as np
import torch
from transformers import BertTokenizer, BertModel

def get_context_vectors(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Get the tokens (for reference)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Forward pass, requesting the hidden states from every layer
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    hidden_states = outputs.hidden_states

    # Each element in hidden_states has shape (batch_size, sequence_length, hidden_size)
    # Take the first element in the batch from the last layer
    last_layer_vectors = hidden_states[-1][0].numpy()  # Shape: (sequence_length, hidden_size)

    return tokens, last_layer_vectors

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors"""
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def extract_contextual_keywords(document, model, tokenizer, top_n=5):
    """Extract contextual keywords from a document"""
    # Split the document into sentences (simple split by period)
    sentences = [s.strip() for s in document.split(".") if s.strip()]

    # Process each sentence to get context vectors
    all_tokens = []
    all_vectors = []
    for sentence in sentences:
        if not sentence:
            continue   # Skip empty sentences

        # Get context vectors
        tokens, vectors = get_context_vectors(sentence, model, tokenizer)

        # Store tokens and vectors (excluding the special tokens [CLS] and [SEP])
        all_tokens.extend(tokens[1:-1])
        all_vectors.extend(vectors[1:-1])

    # Convert to a numpy array, then compute the document vector as the average of all token vectors
    all_vectors = np.array(all_vectors)
    doc_vector = np.mean(all_vectors, axis=0)

    # Calculate the similarity between each token vector and the document vector
    similarities = []
    for token, vec in zip(all_tokens, all_vectors):
        # Skip special tokens, punctuation, and common words
        if token in ["[CLS]", "[SEP]", ".", ",", "!", "?", "the", "a", "an", "is", "are", "was", "were"]:
            continue
        # Compute the similarity, then remember it with the token
        sim = cosine_similarity(vec, doc_vector)
        similarities.append((sim, token))

    # Sort by similarity and keep the top N
    top_similarities = sorted(similarities, reverse=True)[:top_n]
    return top_similarities

# Example document
document = """
Artificial intelligence is transforming industries around the world.
Machine learning algorithms can analyze vast amounts of data to identify patterns and make predictions.
Natural language processing enables computers to understand and generate human language.
Computer vision systems can recognize objects and interpret visual information.
These technologies are driving innovation in healthcare, finance, transportation, and many other sectors.
"""

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

# Extract contextual keywords and print the result
top_keywords = extract_contextual_keywords(document, model, tokenizer, top_n=10)
print("Top contextual keywords:")
for similarity, token in top_keywords:
    print(f"{token}: {similarity:.4f}")
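When run, this script loads bert-base-uncased, embeds each sentence, and prints the ten tokens whose context vectors are most similar to the averaged document vector, together with their cosine similarity scores. Note that the candidates are BERT WordPiece tokens, so some of the reported "keywords" may be subword pieces rather than whole words.

The same helpers can also show how much context shifts a word's vector. The snippet below is a minimal sketch using two made-up sentences in which "bank" carries different senses; it reuses the get_context_vectors() and cosine_similarity() functions defined above.

# Compare the context vectors of the same word in two different sentences
# (illustrative sentences; any word used in two senses would work)
sentence_a = "She sat on the bank of the river."
sentence_b = "He deposited the money in the bank."

tokens_a, vectors_a = get_context_vectors(sentence_a, model, tokenizer)
tokens_b, vectors_b = get_context_vectors(sentence_b, model, tokenizer)

# Locate "bank" in each token list and compare its two context vectors
vec_a = vectors_a[tokens_a.index("bank")]
vec_b = vectors_b[tokens_b.index("bank")]
print(f"Similarity of 'bank' across contexts: {cosine_similarity(vec_a, vec_b):.4f}")

A lower similarity here than you would get from comparing a word with itself in near-identical sentences is exactly the point of context vectors: the same surface token receives different embeddings depending on its surroundings.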
