# sentiment-analysis/test.py
# Fine-tune BERT (bert-base-cased) for binary sentiment classification on IMDB.
import os

# Route Hugging Face downloads through a mirror and pin the model cache.
# NOTE: these must be assigned *before* importing transformers/datasets,
# because the hub client reads them at import time.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['HF_HOME'] = '/home/ht/huggingface/models'

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
# Dataset / task configuration.
dataset_name = "imdb"
task = "sentiment-analysis"

# Shuffle with a fixed seed so the fixed-size subsets selected below are
# reproducible across runs (an unseeded shuffle would pick a different
# subset of examples on every invocation).
dataset = load_dataset(dataset_name).shuffle(seed=42)

# Carve out smaller train/test subsets to speed up training.
train_dataset = dataset["train"].select(range(20000))
test_dataset = dataset["test"].select(range(5000))

# Checkpoint used for both the tokenizer here and the model below.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess_function(examples):
    """Tokenize a batch of dataset rows, padding/truncating to 512 tokens."""
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded
# Apply preprocessing to both splits (batched for speed).
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Binary sentiment head on top of the pretrained encoder. Use model_name
# rather than repeating the hard-coded checkpoint string, so the model is
# guaranteed to match the tokenizer loaded above.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1},
)
# Training hyper-parameters. Evaluate and checkpoint once per epoch so that
# load_best_model_at_end can restore the best checkpoint after training.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
)
# 定义评估指标
|
||||
import numpy as np
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
def compute_metrics(eval_pred):
    """Compute accuracy for a Trainer evaluation step.

    Args:
        eval_pred: (logits, labels) pair; logits has shape
            (batch, num_labels), labels is a 1-D array of class ids.

    Returns:
        dict with a single "accuracy" entry in [0, 1].
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Mean of exact matches — numerically identical to
    # sklearn.metrics.accuracy_score for 1-D label arrays, computed with
    # numpy alone so this function has no sklearn dependency.
    return {"accuracy": float(np.mean(predictions == labels))}
# Wire the model, tokenized splits, and metric function into the Trainer,
# then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

trainer.train()
# Evaluate the fine-tuned model on the held-out split.
eval_results = trainer.evaluate()
print(f"评估结果: {eval_results}")

# Persist model and tokenizer together so they reload as a unit.
save_dir = "./fine_tuned_bert_imdb"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

from transformers import pipeline

# Reload the saved artifacts through the high-level pipeline API.
classifier = pipeline(
    "text-classification",
    model=save_dir,
    tokenizer=save_dir,
)

# Smoke-test prediction on a single positive review.
print(classifier("This movie was fantastic! I loved every minute of it."))
# Manual spot-check kept for reference (superseded by the pipeline call above):
# data = dataset["train"]["text"][:10]
# inputs = tokenizer(data, padding=True, truncation=True, return_tensors="pt")
# outputs = model(**inputs)
# predictions = outputs.logits.argmax(dim=-1)
# labels = dataset["train"]["label"][:10]
# for i, (prediction, label) in enumerate(zip(predictions, labels)):
#     prediction_label = "positive" if prediction == 1 else "negative"
#     true_label = "positive" if label == 1 else "negative"
#     print(f"Example {i+1}: Prediction: {prediction_label}, True Label: {true_label}")