Personal Experiments with BERT
1. Various Experiments
KoBERT
- Cosine similarity without fine-tuning - KoBERT
  - An experiment that feeds two sentences into the pretrained model, with no fine-tuning, and measures how closely they match (a minimal sketch follows this list)
  - Conclusion: fine-tuning is absolutely necessary
- Cosine similarity without fine-tuning - KoELECTRA
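A minimal sketch of that setup, assuming a pretrained BERT-style encoder whose first output is the hidden states, and token ids / valid lengths produced by a transform like the one in section 3. The embed and cosine_similarity names are illustrative, not the original experiment code:

import torch

def embed(model, token_ids, valid_length):
    # One forward pass, no gradients; hidden: (1, seq_len, hidden_size).
    # token_ids is assumed to already be a torch.LongTensor of shape (seq_len,).
    with torch.no_grad():
        hidden = model(token_ids.unsqueeze(0))[0]
    # Mean-pool only over the valid (non-[PAD]) positions.
    return hidden[0, :valid_length].mean(dim=0)

def cosine_similarity(a, b):
    return torch.dot(a, b) / (a.norm() * b.norm())

# ids_a/len_a and ids_b/len_b: transform output for the two sentences.
# sim = cosine_similarity(embed(model, ids_a, len_a), embed(model, ids_b, len_b))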
2. Tokenizer Quick Reference
2.1 SentencePiece Model
import sentencepiece as stp

tokenizer_path = '/tmp/kobert/kobert-news-wiki.spiece'
tokenizer = stp.SentencePieceProcessor(model_file=tokenizer_path)

print("Encode :", tokenizer.Encode("치킨은 맛있다"))                  # ids by default
print("EncodeAsIds :", tokenizer.EncodeAsIds("치킨은 맛있다"))        # same as Encode
print("EncodeAsPieces :", tokenizer.EncodeAsPieces("치킨은 맛있다"))  # subword pieces
print("Decode :", tokenizer.Decode(tokenizer.Encode("치킨")))         # ids back to text
Encode : [4617, 7576, 7086, 1967, 7143]
EncodeAsIds : [4617, 7576, 7086, 1967, 7143]
EncodeAsPieces : ['▁치', '킨', '은', '▁맛', '있다']
Decode : 치킨
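Pieces can be decoded back as well; the '▁' prefix is SentencePiece's word-boundary marker and turns back into a space on decode:

print(tokenizer.DecodePieces(['▁치', '킨', '은', '▁맛', '있다']))  # 치킨은 맛있다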
2.2 GluonNLP - SentencePiece
import gluonnlp as gnlp

tokenizer_path = '/tmp/kobert/kobert-news-wiki.spiece'
# Build a BERT vocab directly from the SentencePiece model file.
vocab = gnlp.vocab.BERTVocab.from_sentencepiece(
    tokenizer_path, padding_token="[PAD]"
)
tokenizer = gnlp.data.BERTSPTokenizer(tokenizer_path, vocab, lower=False)
tokens = tokenizer("치킨은 맛있다")
ids = tokenizer.convert_tokens_to_ids(tokens)
decodes = vocab.to_tokens(ids)
print("EncodeAsPieces:", tokens)
print("EncodeAsIds :", ids)
print("Decode :", decodes)
EncodeAsPieces: ['▁치', '킨', '은', '▁맛', '있다']
EncodeAsIds : [4617, 7576, 7086, 1967, 7143]
Decode : ['▁치', '킨', '은', '▁맛', '있다']
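Note that vocab.to_tokens maps ids back to subword pieces rather than a detokenized string; BERTVocab has no decode-to-text helper, so joining the pieces and mapping '▁' back to spaces is a simple workaround:

print("".join(decodes).replace("▁", " ").strip())  # 치킨은 맛있다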
3. Transform
3.1 GluonNLP - BERTSentenceTransform
transform = gnlp.data.BERTSentenceTransform(
    tokenizer,          # gluonnlp.data.transforms.BERTSPTokenizer
    max_seq_length=64,  # maximum sequence length
    pad=True,           # pad with [PAD] up to max_seq_length
    pair=False,         # a single sentence, not a sentence pair
)
text = "하나님이 세상을 이처럼 사랑하사 독생자를 주셨으니 이는 그를 믿는 자마다 멸망하지 않고 영생을 얻게 하려 하심이라"
token_ids, valid_length, segment_ids = transform([text])
print('[token_ids]\n', token_ids)
print('\n[valid_length]\n', valid_length)
print('\n[segment_ids]\n', segment_ids)
print('\n[id -> token]\n', tokenizer.vocab.to_tokens(token_ids.tolist()))
[token_ids]
[ 2 4928 5778 7096 2812 3748 2590 7782 6493 1725 6542 7158 4213 6604
7076 3658 1185 6116 517 6266 5760 3886 6142 517 6202 6165 7819 3149
3376 6542 7088 517 6869 5400 517 7806 4924 6745 7101 3 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1]
[valid_length]
40
[segment_ids]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[id -> token]
['[CLS]', '▁하나', '님', '이', '▁세상을', '▁이처럼', '▁사랑', '하', '사', '▁독', '생', '자를', '▁주', '셨', '으니', '▁이는', '▁그', '를', '▁', '믿', '는', '▁자', '마다', '▁', '멸', '망', '하지', '▁않고', '▁영', '생', '을', '▁', '얻', '게', '▁', '하려', '▁하', '심', '이라', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
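The transform returns only token ids, the valid length, and segment ids; an attention mask still has to be built before the ids go into a model. A common pattern is a small helper like the one below (gen_attention_mask is illustrative, not part of gluonnlp), which sets 1 for real tokens and 0 for [PAD]:

import torch

def gen_attention_mask(token_ids, valid_length):
    # token_ids: (batch, seq_len); valid_length: one length per example.
    attention_mask = torch.zeros_like(token_ids)
    for i, v in enumerate(valid_length):
        attention_mask[i][:v] = 1
    return attention_mask.float()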
4. KoELECTRA
4.1 Extracting the Encoder Output
import torch
from transformers import ElectraForSequenceClassification

device = torch.device("cpu")
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator"
).to(device)

# Capture an intermediate layer's output with a forward hook.
activation = {}

def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

# Hook the output module of the last (index 11) encoder layer.
model.electra.encoder.layer[11].output.register_forward_hook(get_activation('output'))
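A quick usage sketch, assuming the matching KoELECTRA tokenizer (the sentence is just an example): after one forward pass, the hooked layer's hidden states sit in activation['output'].

from transformers import ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
inputs = tokenizer("치킨은 맛있다", return_tensors="pt").to(device)
with torch.no_grad():
    model(**inputs)
print(activation['output'].shape)  # torch.Size([1, seq_len, 768]) for the base model

Recent transformers versions can also return the same tensors directly via output_hidden_states=True, without registering a hook.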