# Allow Java-based packages up to 8 GB of memory (must be set before any such package loads)
options(java.parameters = "-Xmx8g")
# Load the following packages (if any are missing, see the install sketch after these library() calls)
library(topicmodels) # LDA estimation
library(doParallel)  # parallel backend (used by the optional k-search sketch below)
library(ggplot2)     # plotting (used by the optional k-search sketch below)
library(scales)      # axis-break helpers for ggplot2
library(slam)        # sparse-matrix utilities used alongside topicmodels
library(tictoc)      # simple timing
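# If any of the above are missing, a one-time install along these lines will
# fetch them from CRAN ("lda" is included because lda::top.topic.words() is
# used near the end of this script):
# install.packages(c("topicmodels", "doParallel", "ggplot2", "scales",
#                    "slam", "tictoc", "lda"))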
# First, download the Meaning Extraction Helper (MEH) here: https://www.ryanboyd.io/software/meh/download/
# Once downloaded, open MEH and select the following parameters (anything I don't mention stays on its default selection):
# Input File Settings: upload "tweets.csv"; "id" is the column to be used as row identifier; "cleaned_tweets" is the column containing text
# Output Generation: select the folder where you want the output to be saved
# Text Segmentation: select "no segmentation"
# Conversion List: use default selections
# Stop List: use default selections
# Token Handling Options: use default selections
# Choose Output Types: adjust the number in "Prune Frequency List after X Docs" to your preferred value;
# de-select "Binary Document by Term Matrix" and "Verbose Document by Term Matrix"; select "Raw Count Document by Term Matrix"
# N-gram Settings: enter a number in "Ignore Documents with a Word Count less than";
# set A (min) = 1 and B (max) = 1; select "Retain the X most Frequent N-grams (by raw frequency)"
# and input your chosen threshold (X) there: I've used 500 in the past, but it may need adjusting
# Once you finish selecting the parameters, click "Start!" under Begin Analysis
# Import the MEH_DTM_RawCount file from the MEH output folder and make it a
# dataframe named "DF_DTMatrix". The file name below is an example (MEH
# date-stamps its output), so adjust the path to match your own file.
DF_DTMatrix <- read.csv("2022_11_08_MEH_DTM_RawCount.csv", check.names = FALSE)
# Set the number of topics you want to extract here (if you're not sure how
# many to use, see the k-search sketch just before the LDA call below)
NumberOfTopics <- 6
# Indicate the number of "most likely" terms that you want to show for any given topic.
# I've used 20 previously, but change this to whatever suits your purposes
MaximumTermsPerTopic <- 20
# Create a folder named "Results" in current working directory for our results
dir.create("Results", showWarnings = FALSE)
# Strip away the extra info from our document term matrix (Segment, WC, RawTokenCount)
DF_DTMatrix <- DF_DTMatrix[, !names(DF_DTMatrix) %in% c('Segment', 'WC', 'RawTokenCount')]
# Only keep rows that don't sum to 0 (i.e., documents that have at least one of our target words)
rowTotals <- rowSums(DF_DTMatrix[, -1])  # skip column 1 (Filename)
DF_DTMatrix <- DF_DTMatrix[rowTotals > 0, ]
remove(rowTotals)
# Set aside our filenames, then drop the Filename column so only term counts remain
Filenames <- DF_DTMatrix$Filename
DF_DTMatrix <- DF_DTMatrix[, !names(DF_DTMatrix) %in% c("Filename")]
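# If you're unsure how many topics to extract, one option (a sketch, not part
# of the original pipeline) is to fit models across a range of candidate k
# values in parallel and compare log-likelihoods, looking for where the gains
# level off. The candidate range and cluster size below are assumptions;
# uncomment to run (this can take a while):
# CandidateKs <- seq(2, 20, by = 2)
# cl <- makeCluster(detectCores() - 1)
# registerDoParallel(cl)
# tic("k search")
# LogLiks <- foreach(k = CandidateKs, .combine = c,
#                    .packages = "topicmodels") %dopar% {
#   as.numeric(logLik(LDA(DF_DTMatrix, k = k, method = "Gibbs",
#                         control = list(alpha = 0.1))))
# }
# toc()
# stopCluster(cl)
# ggplot(data.frame(k = CandidateKs, LogLik = LogLiks), aes(k, LogLik)) +
#   geom_line() + scale_x_continuous(breaks = pretty_breaks())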
# Run LDA model; I typically use "Gibbs" and alpha = 0.1
lda.model <- LDA(DF_DTMatrix, k=NumberOfTopics, method="Gibbs", control = list(alpha = 0.1))
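# Gibbs sampling is stochastic, so results vary run to run; if you need
# reproducible topics, the control list also accepts a seed (the value 1234
# here is arbitrary), e.g.:
# lda.model <- LDA(DF_DTMatrix, k=NumberOfTopics, method="Gibbs",
#                  control = list(alpha = 0.1, seed = 1234))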
# Dump our term results to a .csv file in the "Results" folder
DataToPrint <- terms(lda.model, MaximumTermsPerTopic)
write.csv(DataToPrint, paste("Results/", Sys.Date(), "_-_LDA_Topic_Terms.csv", sep=""), na="", row.names=FALSE)
# This prints the most likely topic for each document to a file in the "Results" folder
DataToPrint <- data.frame(Filenames)
DataToPrint$Topics <- topics(lda.model)
write.csv(DataToPrint, paste("Results/", Sys.Date(), "_-_LDA_Document_Categorization.csv", sep=""), na="", row.names=FALSE)
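# Optional quick check (not part of the original outputs): how many documents
# were assigned to each topic
table(DataToPrint$Topics)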
# Rank topic terms for each topic by score; saves to a file in the "Results" folder
tmResult <- posterior(lda.model)
attributes(tmResult)    # quick look at what posterior() returns ($terms and $topics)
beta <- tmResult$terms  # topic-by-term probability matrix
dim(beta)               # should be NumberOfTopics x vocabulary size
# top.topic.words() comes from the "lda" package (install it if needed; see the install sketch above)
topictermsranked <- apply(lda::top.topic.words(beta, MaximumTermsPerTopic, by.score = TRUE), 2, paste, collapse = " ")
write.csv(topictermsranked, paste("Results/", Sys.Date(), "_-_Topic_Terms_Ranked.csv", sep=""), na="", row.names=FALSE)
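# posterior() also gives the full per-document topic distribution (theta); as a
# small extension of the outputs above, you can save those probabilities too:
theta <- tmResult$topics
write.csv(data.frame(Filename = Filenames, theta),
          paste("Results/", Sys.Date(), "_-_LDA_Document_Topic_Probabilities.csv", sep=""),
          na="", row.names=FALSE)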