-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcr_nlp.py
128 lines (88 loc) · 3.45 KB
/
cr_nlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from transformers import AutoTokenizer
from transformers import pipeline
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
# Fetch the 'vader_lexicon' resource when this module is imported.
# NOTE(review): presumably this is the lexicon that SentimentIntensityAnalyzer
# (used by analyze_sentiment_vader) reads — confirm against the NLTK docs.
nltk.download('vader_lexicon')
# Tokenizers already loaded by tokenize_text, keyed by model name, so repeated
# calls with the same model do not load the tokenizer again.
_TOKENIZER_CACHE = {}


def tokenize_text(text: str, model_name: str = "bert-base-uncased") -> list:
    """
    Tokenize a given text using the Hugging Face Transformers library.

    The tokenizer for each ``model_name`` is loaded with
    ``AutoTokenizer.from_pretrained`` on first use and reused by later calls
    that pass the same name.

    Parameters:
    - text (str): The input text to tokenize.
    - model_name (str): The name of the pre-trained model to use for tokenization.
      Default is "bert-base-uncased".

    Returns:
    - tokens (list): List of tokens obtained by tokenizing the input text.
    """
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        # Only a successful load is stored; if from_pretrained raises, nothing
        # is cached and the next call simply tries again.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer.tokenize(text)
# Lazily created 'sentiment-analysis' pipeline shared by every
# analyze_sentiment call; stays None until the first call builds it.
_SENTIMENT_CLASSIFIER = None


def analyze_sentiment(sentence: str) -> dict:
    """
    Analyze the sentiment of a given sentence with a Transformers pipeline.

    This function uses the 'sentiment-analysis' pipeline from the Hugging Face
    transformers library. No model name is passed, so the library's default
    model for that task is used. The pipeline is created on the first call and
    reused afterwards, so the model is not reloaded for every sentence.

    Parameters:
    - sentence (str): The sentence for which to analyze the sentiment.

    Returns:
    - dict: The first entry of the pipeline's result list. With the default
      model this is expected to hold a label (e.g. 'POSITIVE' or 'NEGATIVE')
      and the associated confidence score.

    Example usage:
        sentiment = analyze_sentiment("I love this product!")
        print(sentiment)  # Output might look like: {'label': 'POSITIVE', 'score': 0.999}
    """
    global _SENTIMENT_CLASSIFIER
    if _SENTIMENT_CLASSIFIER is None:
        # Building the pipeline is the expensive step; do it once per process.
        # If construction raises, the global stays None and a later call retries.
        _SENTIMENT_CLASSIFIER = pipeline('sentiment-analysis')

    # The pipeline returns a list of results; hand back its first entry.
    results = _SENTIMENT_CLASSIFIER(sentence)
    return results[0]
def get_wordnet_pos(tag):
    """
    Helper that converts an NLTK POS tag to the matching WordNet POS tag.

    Parameters:
    - tag: The POS tag to convert; only the way it starts is inspected.

    Returns:
    - wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV for tags starting
      with 'J', 'V', 'N' or 'R' respectively, and None for any other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # No known prefix matched.
    return None
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Lazily created VADER analyzer reused across analyze_sentiment_vader calls;
# stays None until the first call constructs it.
_VADER_ANALYZER = None


def analyze_sentiment_vader(text):
    """
    Analyzes the sentiment of a given text using VADER sentiment analysis.

    The SentimentIntensityAnalyzer is constructed on the first call and reused
    by later calls instead of being rebuilt every time.

    Parameters:
    - text: A string containing the text to analyze.

    Returns:
    - A dictionary containing the scores for negative, neutral, positive, and
      compound sentiments, as produced by ``polarity_scores``.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        # If construction raises, the global stays None so a later call
        # retries instead of caching a failure.
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(text)
import nltk
from nltk.stem.porter import PorterStemmer
def stem_words(words):
    """
    Stem every word in a list with the Porter stemming algorithm.

    Each word is passed through NLTK's PorterStemmer, which reduces it to its
    root or stem form. This is handy in natural language processing and search
    applications, where the root meaning of a word matters more than its exact
    form.

    Parameters:
    - words: A list of words (strings) to be stemmed.

    Returns:
    - A list containing the stemmed version of each input word, in input order.

    Example:
    >>> stem_words(["running", "jumps", "easily"])
    ['run', 'jump', 'easili']

    Note: This function requires the nltk's PorterStemmer to be imported.
    """
    porter = PorterStemmer()
    # Apply the stemmer to each word and collect the results in a list.
    return list(map(porter.stem, words))