# outputs: { 'text': [ 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'things really get weird , though not particularly scary : the movie is all portent and no content .' ], 'label': [1, 0] }
表征类文本分类(Text Classification with Representation Models)
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

# Average the embeddings of all training documents that share a label.
# The label is appended as one extra column after the embedding columns,
# so its column name equals the embedding dimensionality. Derive it from
# the data instead of hard-coding 768 (which only holds for this model).
label_column = train_embeddings.shape[1]
df = pd.DataFrame(
    np.hstack([train_embeddings, np.array(data["train"]["label"]).reshape(-1, 1)])
)
averaged_target_embeddings = df.groupby(label_column).mean().values

# For each evaluation document, pick the label whose averaged embedding
# is most similar (cosine similarity, argmax over labels).
sim_matrix = cosine_similarity(test_embeddings, averaged_target_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

# Evaluate the model
evaluate_performance(data["test"]["label"], y_pred)
step2: 计算 test sample 中样本 embedding 和正负样本 embedding 的相似度,更接近的那个 embedding 所对应的标签就是预测类别
# Zero-shot classification: embed a short textual description of each
# label, then assign every document the label whose description it is
# closest to in embedding space.
label_embeddings = model.encode(["A negative review", "A positive review"])

from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every test document and each label
# description; the most similar label (argmax per row) is the prediction.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)
# Run inference y_pred = [] for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])): text = output[0]["generated_text"] y_pred.append(0if text == "negative"else1) evaluate_performance(data["test"]["label"], y_pred)