import numpy as np
import torch
from transformers import BertTokenizer, BertModel

def get_context_vectors(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    # Get the tokens (for reference)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # Forward pass, get all hidden states from each layer
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    hidden_states = outputs.hidden_states
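    # For bert-base models, hidden_states is a tuple of 13 tensors: the embedding output plus one per encoder layer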
    # Each element in hidden_states has shape (batch_size, sequence_length, hidden_size)
    # Here we take the first element in the batch from the last layer
    last_layer_vectors = hidden_states[-1][0].numpy()  # Shape: (sequence_length, hidden_size)
    return tokens, last_layer_vectors

def cosine_similarity(vec1, vec2):
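    # Assumes non-zero vectors; BERT hidden states are never all zeros in practice, so no zero-division guard is added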
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def extract_contextual_keywords(document, model, tokenizer, top_n=5):
    """Extract contextual keywords from a document."""
    # Split the document into sentences (simple split by period)
    sentences = [s.strip() for s in document.split(".") if s.strip()]
    # Process each sentence to get context vectors
    all_tokens = []
    all_vectors = []
    for sentence in sentences:
        if not sentence:
            continue  # Skip empty sentences
        # Get context vectors
        tokens, vectors = get_context_vectors(sentence, model, tokenizer)
        # Store tokens and vectors (excluding the special tokens [CLS] and [SEP])
        all_tokens.extend(tokens[1:-1])
        all_vectors.extend(vectors[1:-1])
    # Convert to a numpy array, then compute the document vector as the average of all token vectors
    all_vectors = np.array(all_vectors)
    doc_vector = np.mean(all_vectors, axis=0)
    # Calculate the similarity between each token vector and the document vector
    similarities = []
    for token, vec in zip(all_tokens, all_vectors):
        # Skip special tokens, punctuation, and common words
        if token in ["[CLS]", "[SEP]", ".", ",", "!", "?", "the", "a", "an", "is", "are", "was", "were"]:
            continue
        # Compute the similarity, then keep it together with the token
        sim = cosine_similarity(vec, doc_vector)
        similarities.append((sim, token))
    # Sort by similarity (tuples compare on the score first) and keep the top N
    top_similarities = sorted(similarities, reverse=True)[:top_n]
    return top_similarities

# Example document
document = """
Artificial intelligence is transforming industries around the world.
Machine learning algorithms can analyze vast amounts of data to identify patterns and make predictions.
Natural language processing enables computers to understand and generate human language.
Computer vision systems can recognize objects and interpret visual information.
These technologies are driving innovation in healthcare, finance, transportation, and many other sectors.
"""

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
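# eval() puts the model in evaluation mode, disabling dropout so the hidden states are deterministic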
model.eval()
# Extract contextual keywords and print the result
top_keywords = extract_contextual_keywords(document, model, tokenizer, top_n=10)
print("Top contextual keywords:")
for similarity, token in top_keywords:
    print(f"{token}: {similarity:.4f}")