# bert.py

"""BERT NER Inference.""" 

from __future__ import absolute_import, division, print_function
import numpy as np
import json
import os
import nltk
from keras.models import load_model
import torch
import torch.nn.functional as F
from nltk import word_tokenize
from pytorch_pretrained_bert.modeling import (CONFIG_NAME, WEIGHTS_NAME,
                                              BertConfig,
                                              BertForTokenClassification)
from pytorch_pretrained_bert.tokenization import BertTokenizer

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(1337)

# Class index -> fine-grained entity type. The position of each name in the
# list below is its class index (0..43). Names are runtime labels and are
# reproduced exactly as given (including the 'location/geograpy' spelling at
# index 29, which is distinct from 'location/geography' at index 11).
idx2tag = dict(enumerate([
    'other/scientific',
    'person/artist',
    'person/title',
    'organization/sports_team',
    'organization/company',
    'other/supernatural',
    'other/product',
    'other/religion',
    'location/city',
    'other/event',
    'location/country',
    'location/geography',
    'other/living_thing',
    'person/political_figure',
    'other/internet',
    'other/award',
    'person/athlete',
    'organization/education',
    'other/art',
    'other/health',
    'other/body_part',
    'person/religious_leader',
    'other/language',
    'location/structure',
    'organization/political_party',
    'organization/military',
    'other/currency',
    'organization/music',
    'other/legal',
    'location/geograpy',
    'location/celestial',
    'other/heritage',
    'organization/government',
    'other/food',
    'organization/stock_exchange',
    'organization/transit',
    'other/sports_and_leisure',
    'person/military',
    'organization/sports_league',
    'location/transit',
    'person/legal',
    'location/park',
    'person/doctor',
    'person/coach',
]))

class BertNer(BertForTokenClassification):
    """Token classifier that scores only the positions flagged in ``valid_ids``.

    The hidden vectors at flagged positions are packed to the front of each
    sequence (in order) and the remaining slots are zero-filled before
    dropout and the classification layer are applied.
    """

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, valid_ids=None):
        """Return per-position classification logits.

        Args:
            input_ids: token id tensor passed to ``self.bert``.
            token_type_ids: optional segment ids passed to ``self.bert``.
            attention_mask: optional attention mask passed to ``self.bert``.
            valid_ids: per-position flags; positions whose flag equals 1 are
                kept and packed to the front of the sequence. When ``None``
                every position is kept unchanged.

        Returns:
            The output of ``self.classifier`` applied to the packed (and
            dropout-processed) hidden states.
        """
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)

        if valid_ids is None:
            # No flags supplied: nothing to pack, score every position.
            valid_output = sequence_output
        else:
            batch_size, max_len, _ = sequence_output.shape
            # Allocate on the same device / dtype as the encoder output so the
            # module also works when it has been moved to GPU or half precision.
            valid_output = torch.zeros_like(sequence_output)
            valid_ids = torch.as_tensor(valid_ids, device=sequence_output.device)
            for row in range(batch_size):
                # Select the flagged positions of this sequence in one indexing
                # operation and write them to the leading slots.
                kept = sequence_output[row][valid_ids[row][:max_len] == 1]
                valid_output[row, :kept.size(0)] = kept

        sequence_output = self.dropout(valid_output)
        logits = self.classifier(sequence_output)
        return logits

class Ner:

    def __init__(self,model_dir: str):
        """Load model, tokenizer and configuration from *model_dir*.

        Stores the loaded objects on the instance, converts the keys of the
        configured ``label_map`` to ``int`` and switches the model to
        evaluation mode.
        """
        loaded = self.load_model(model_dir)
        self.model, self.tokenizer, self.model_config = loaded

        settings = self.model_config
        raw_label_map = settings["label_map"]
        self.max_seq_length = settings["max_seq_length"]
        # Keys are converted with int() so predicted class indices can be
        # used directly for lookup.
        self.label_map = {int(key): name for key, name in raw_label_map.items()}

        self.model.eval()

    def load_model(self, model_dir: str, model_config: str = "model_config.json"):
        """Build the model and tokenizer described by the files in *model_dir*.

        Args:
            model_dir: directory containing the JSON settings file, the BERT
                config file (``CONFIG_NAME``) and the weights (``WEIGHTS_NAME``).
            model_config: file name of the JSON settings file inside
                *model_dir*. It must provide ``num_labels`` and ``bert_model``.

        Returns:
            A ``(model, tokenizer, model_config)`` tuple where ``model_config``
            is the parsed JSON settings dict.
        """
        settings_path = os.path.join(model_dir, model_config)
        # Context manager so the file handle is closed deterministically.
        with open(settings_path) as handle:
            model_config = json.load(handle)

        output_config_file = os.path.join(model_dir, CONFIG_NAME)
        output_model_file = os.path.join(model_dir, WEIGHTS_NAME)

        config = BertConfig(output_config_file)
        model = BertNer(config, num_labels=model_config["num_labels"])
        # map_location="cpu" lets checkpoints saved from a GPU be loaded on a
        # machine without CUDA; load_state_dict copies the values into the
        # model's own parameters afterwards.
        state_dict = torch.load(output_model_file, map_location="cpu")
        model.load_state_dict(state_dict)

        tokenizer = BertTokenizer.from_pretrained(model_config["bert_model"],do_lower_case=False)
        return model, tokenizer, model_config

    def tokenize(self, text: str):
        """Tokenize *text* word by word.

        The text is split on whitespace and every word is passed to
        ``self.tokenizer.tokenize``.

        Returns:
            ``(tokens, valid_positions)``: the concatenated pieces of all
            words, and a parallel list holding 1 for the first piece of each
            word and 0 for every following piece of the same word.
        """
        all_pieces = []
        first_piece_flags = []
        for word in text.split():
            pieces = self.tokenizer.tokenize(word)
            all_pieces.extend(pieces)
            # One flag per piece: 1 for the leading piece, 0 for the rest.
            first_piece_flags.extend(int(k == 0) for k in range(len(pieces)))
        return all_pieces, first_piece_flags

    def preprocess(self, text: str):
        """Turn *text* into the padded id/mask lists fed to the model.

        The tokens from ``self.tokenize`` are wrapped in ``[CLS]`` / ``[SEP]``
        (both flagged as valid), converted to ids, and every list except
        ``tokens`` is right-padded with zeros up to ``self.max_seq_length``.
        Inputs that are already longer than that are returned unpadded and
        untruncated.

        Returns:
            ``(input_ids, input_mask, segment_ids, valid_positions, tokens)``.
        """
        word_pieces, first_piece_flags = self.tokenize(text)

        tokens = ["[CLS]"] + word_pieces + ["[SEP]"]
        valid_positions = [1] + first_piece_flags + [1]
        segment_ids = [0] * len(tokens)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        # Number of padding slots still needed (none when already long enough).
        shortfall = self.max_seq_length - len(input_ids)
        if shortfall > 0:
            filler = [0] * shortfall
            input_ids.extend(filler)
            input_mask.extend(filler)
            segment_ids.extend(filler)
            valid_positions.extend(filler)

        return input_ids,input_mask,segment_ids,valid_positions,tokens
   #added function 
    
    def get_bert_embedding(self, textlist,tags,file_name,model_dir='out/',model_config="model_config.json"):
        """Collect per-word feature vectors grouped by fine-grained tag.

        For every word whose gold tag splits on ``"/"`` into more than two
        parts, the feature vector is the encoder output at the word's first
        piece concatenated with columns 1..8 of the word's classification
        logits. Vectors are grouped under the key ``part[1] + '/' + part[2]``
        of the split tag, written to *file_name* as JSON and returned.

        The features are computed with the already loaded ``self.model``
        (its ``bert`` encoder and its full forward pass), inside
        ``torch.no_grad()``. Earlier revisions built a new, never-loaded
        ``BertNer`` for the encoder output, which produced embeddings from
        randomly initialised weights, and indexed the encoder output by word
        number rather than by token position.

        Args:
            textlist: sequence of input texts.
            tags: ``tags[n][k]`` is the tag string of the k-th whitespace
                separated word of ``textlist[n]``.
            file_name: path of the JSON file to write.
            model_dir: unused; kept so existing callers keep working.
            model_config: unused; kept so existing callers keep working.

        Returns:
            Dict mapping tag key to a list of feature vectors (lists of float).
        """
        dict_embeddings = {}
        total = len(textlist)

        for n_txt, text in enumerate(textlist):
            if n_txt % 1000 == 0:
                print('{}/{}'.format(n_txt, total))

            input_ids, input_mask, segment_ids, valid_positions, tokens = self.preprocess(text)
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            input_mask = torch.tensor([input_mask], dtype=torch.long)
            segment_ids = torch.tensor([segment_ids], dtype=torch.long)
            valid_ids = torch.tensor([valid_positions], dtype=torch.long)

            # Inference only: no autograd graph is needed for feature export.
            with torch.no_grad():
                # Word-level logits: index k is the k-th flagged position,
                # i.e. 0 is [CLS] and k >= 1 is the k-th word.
                logits = self.model(input_ids, segment_ids, input_mask, valid_ids)
                # Token-level encoder output: indexed by token position.
                bert_embed, _ = self.model.bert(input_ids, segment_ids, input_mask,
                                                output_all_encoded_layers=False)

            sentence_tags = tags[n_txt]
            word_idx = 0
            # Skip position 0 ([CLS]) and the last token ([SEP]).
            for pos in range(1, len(tokens) - 1):
                if not valid_positions[pos]:
                    continue  # continuation piece, not the start of a word
                word_idx += 1

                parts = sentence_tags[word_idx - 1].split("/")
                if len(parts) <= 2:
                    continue  # tag carries no "<group>/<type>" suffix

                tag = parts[1] + '/' + parts[2]
                feature = bert_embed[0, pos, :].tolist() + logits[0, word_idx, 1:9].tolist()
                dict_embeddings.setdefault(tag, []).append(feature)

        with open(file_name, 'w') as f:
            json.dump(dict_embeddings, f)

        return dict_embeddings

    def predict(self, text: str,ok=False):
        """Tag every whitespace-separated word of *text*.

        The word-level labels come from ``self.model``. When *ok* is truthy
        the non-``'O'`` labels are additionally refined with the Keras model
        stored in ``classifier.h5``: each such word is described by the
        encoder output at its first piece concatenated with columns 1..8 of
        its logits, the classifier's arg-max is mapped through ``idx2tag``
        and the tag is rewritten as ``"B-<type>"``; an ``I`` tag that follows
        another entity tag inherits that tag's type as ``"I-<type>"``.

        The encoder output is taken from the loaded ``self.model.bert`` and
        only computed when *ok* is truthy. Earlier revisions rebuilt an
        unloaded model from a hard-coded ``'out/'`` directory on every call.

        Args:
            text: input sentence.
            ok: when truthy, run the secondary classifier and print the
                unrefined tags.

        Returns:
            ``(output, logit)`` where ``output`` is a list of
            ``(word, {"tag": tag})`` pairs and ``logit`` is the raw output of
            ``self.model`` for the padded input.

        Raises:
            AssertionError: if the number of predicted tags differs from the
                number of whitespace-separated words.
        """
        input_ids, input_mask, segment_ids, valid_positions, tokens = self.preprocess(text)
        # Flagged positions = [CLS] + one per word + [SEP].
        n_valid = sum(valid_positions)

        input_ids = torch.tensor([input_ids],dtype=torch.long)
        input_mask = torch.tensor([input_mask],dtype=torch.long)
        segment_ids = torch.tensor([segment_ids],dtype=torch.long)
        valid_ids = torch.tensor([valid_positions],dtype=torch.long)

        with torch.no_grad():
            logit = self.model(input_ids, segment_ids, input_mask,valid_ids)
        label_ids = torch.argmax(F.softmax(logit,dim=2),dim=2)[0].tolist()

        # The model packs flagged positions to the front: slot 0 is [CLS],
        # slots 1 .. n_valid-2 are the words, slot n_valid-1 is [SEP].
        tags = [self.label_map[label_ids[k]] for k in range(1, n_valid - 1)]
        words = text.split()
        assert len(tags) == len(words)

        if ok:
            bert_tags = list(tags)  # unrefined tags, printed below

            with torch.no_grad():
                bert_embed, _ = self.model.bert(input_ids, segment_ids, input_mask,
                                                output_all_encoded_layers=False)

            # Token position of the first piece of each word ([CLS]/[SEP] excluded).
            first_piece = [pos for pos in range(1, len(tokens) - 1) if valid_positions[pos]]
            # Encoder output is indexed by token position, logits by word slot.
            features = [
                bert_embed[0, pos, :].tolist() + logit[0, word_idx, 1:9].tolist()
                for word_idx, pos in enumerate(first_piece, 1)
                if tags[word_idx - 1] != 'O'
            ]

            y_pred = []
            if features:  # skip the classifier when no entity was found
                classifier = load_model("classifier.h5")
                batch = np.array(features)
                batch = batch.reshape(len(features), -1, 1)
                predictions = classifier.predict(batch)
                y_pred = [idx2tag[int(np.argmax(row))] for row in predictions]

            j = 0  # index into y_pred: one prediction per non-'O' word
            for i, tag in enumerate(tags):
                if tag == 'O':
                    continue
                continues_entity = tag[0] == 'I' and i > 0 and tags[i - 1] != 'O'
                if continues_entity:
                    # Inherit the (already refined) type of the previous word.
                    tags[i] = "I-" + tags[i - 1][2:]
                else:
                    # Entity start, or an I tag with nothing to inherit from.
                    prefix = "I-" if tag[0] == 'I' else "B-"
                    tags[i] = prefix + y_pred[j]
                j += 1

            print("bert output :\n{}".format(bert_tags))

        output = [(word,{"tag":label}) for word,label in zip(words,tags)]

        return output, logit