Text analysis in R with multi-word terms and TF-IDF

I am quite new to R and I am trying to run a text analysis with TF-IDF on a set of reports, counting only the words in a dictionary I built. The code below produces results, but it fails to handle multi-word terms. For instance, it counts "technology" but not "data technology". I need to fix the code so that multi-word terms are included in the analysis.
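
To make the problem concrete, here is a tiny toy illustration (not my actual data): with single-word tokenization, "data technology" is split into two tokens, so it can never appear as a term in the document-term matrix.

    library(tidyverse)
    library(tidytext)
    
    doc <- tibble(text = "Our data technology improves data security.")
    
    # Default tokenization yields one word per row, so the bigram
    # "data technology" never appears as a single token
    doc %>% unnest_tokens(word, text)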

See the code I am using below:


    # Load libraries
    library(tidyverse)
    library(tm)
    library(tidytext)
    library(readxl)
    
    # Setting the folder where the documents are (set to subfolder 2012 for now to make it easier to handle)
    wd <- "C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/2012"
    
    # Create the corpus and clean it up a bit
    corpus <- Corpus(DirSource(wd, recursive = TRUE)) # Create corpus
    corpus <- tm_map(corpus, removePunctuation) # remove punctuation
    corpus <- tm_map(corpus, removeNumbers) # remove numbers
    corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove English stop words
    
    # Create a DocumentTerm Matrix
    dtm <- DocumentTermMatrix(corpus)
    
    # Use multiple steps to...
    corpus_words <- tidy(dtm) %>% # ... transform the dtm to a tidy object
      bind_tf_idf(term, document, count) # ... use bind_tf_idf() from tidytext to calculate tf-idf
    
    total_words <- corpus_words %>% group_by(document) %>% summarize(total = sum(count)) # Calculate the number of words in each document
    corpus_words <- left_join(corpus_words, total_words, by = "document") # add it to the table
    
    # Get the words of interest from the dictionary and rename the columns
    dictionary <- read_xlsx("C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/DictionaryLIWCDigital_OnlyDigital_TG.xlsx", col_names = FALSE)
    names(dictionary) <- c("term", "group")
    
    # Take the individual term lists
    inno_terms    <- dictionary$term[dictionary$group==1]
    techno_terms  <- dictionary$term[dictionary$group==2]
    data_terms    <- dictionary$term[dictionary$group==3]
    digital_terms <- dictionary$term[dictionary$group==4]
    
    # Filter the corpus for the words of interest
    TF_IDF_Inno_terms2 <- corpus_words %>% filter(grepl(paste(inno_terms, collapse = "|"), term))
    TF_IDF_techno_terms2 <- corpus_words %>% filter(grepl(paste(techno_terms, collapse = "|"), term))
    TF_IDF_data_terms2 <- corpus_words %>% filter(grepl(paste(data_terms, collapse = "|"), term))
    TF_IDF_digital_terms2 <- corpus_words %>% filter(grepl(paste(digital_terms, collapse = "|"), term))
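
One caveat with the grepl() filtering above (a suggested refinement, untested on my full data): paste(terms, collapse = "|") matches substrings, so a short dictionary term can match inside longer, unrelated words. Anchoring the pattern with word boundaries avoids that:

    # Anchor the dictionary terms with word boundaries so that a short term
    # such as "ai" cannot match inside a longer word like "maintain"
    inno_pattern <- paste0("\\b(", paste(inno_terms, collapse = "|"), ")\\b")
    TF_IDF_Inno_terms2 <- corpus_words %>% filter(grepl(inno_pattern, term))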
    
I've tried different variations of this code to address the multi-word issue, with no success.

#ATTEMPT 2#

    # Function to check whether all the multi-word terms are present in a document
    check_multiword <- function(doc, multiword_terms) {
      text <- paste(as.character(doc), collapse = " ") # flatten the tm document to a single string
      all(sapply(multiword_terms, function(term) grepl(term, text)))
    }
    
    # Filter the corpus for documents containing the multi-word terms of interest
    docs_with_multiword_inno    <- Filter(function(doc) check_multiword(doc, inno_terms), corpus)
    docs_with_multiword_techno  <- Filter(function(doc) check_multiword(doc, techno_terms), corpus)
    docs_with_multiword_data    <- Filter(function(doc) check_multiword(doc, data_terms), corpus)
    docs_with_multiword_digital <- Filter(function(doc) check_multiword(doc, digital_terms), corpus)

#ATTEMPT 3#

    # Filter the tidied words down to the documents that contain the multi-word terms
    # (names() gives the document IDs, matching the `document` column from tidy(dtm))
    corpus_words_inno    <- corpus_words %>% filter(document %in% names(docs_with_multiword_inno))
    corpus_words_techno  <- corpus_words %>% filter(document %in% names(docs_with_multiword_techno))
    corpus_words_data    <- corpus_words %>% filter(document %in% names(docs_with_multiword_data))
    corpus_words_digital <- corpus_words %>% filter(document %in% names(docs_with_multiword_digital))

#ATTEMPT 4#

    # Function to check whether all the multi-word terms are present in a document,
    # this time with word boundaries and case-insensitive matching
    check_multiword <- function(doc, multiword_terms) {
      text <- paste(as.character(doc), collapse = " ") # flatten the document to a single string
      all(sapply(multiword_terms, function(term) grepl(paste0("\\b", term, "\\b"), text, ignore.case = TRUE)))
    }
    
    # Filter the corpus for documents containing the multi-word terms of interest
    docs_with_multiword_inno    <- Filter(function(doc) check_multiword(doc, inno_terms), corpus)
    docs_with_multiword_techno  <- Filter(function(doc) check_multiword(doc, techno_terms), corpus)
    docs_with_multiword_data    <- Filter(function(doc) check_multiword(doc, data_terms), corpus)
    docs_with_multiword_digital <- Filter(function(doc) check_multiword(doc, digital_terms), corpus)
    
    # Filter the tidied words down to those documents
    corpus_words_inno    <- corpus_words %>% filter(document %in% names(docs_with_multiword_inno))
    corpus_words_techno  <- corpus_words %>% filter(document %in% names(docs_with_multiword_techno))
    corpus_words_data    <- corpus_words %>% filter(document %in% names(docs_with_multiword_data))
    corpus_words_digital <- corpus_words %>% filter(document %in% names(docs_with_multiword_digital))

#ATTEMPT 5#

    # Create the corpus and clean it up a bit
    corpus <- Corpus(DirSource(wd, recursive = TRUE)) # Create corpus
    corpus <- tm_map(corpus, removePunctuation) # remove punctuation
    corpus <- tm_map(corpus, removeNumbers) # remove numbers
    corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove English stop words
    
    # Multi-word terms of interest
    multiword_terms <- c("new product", "new products", "new technologies", "new services",
                         "new solutions", "renew services", "artificial intelligence",
                         "machine learning", "data technology", "data security",
                         "data protection", "personal data", "data collection",
                         "store data", "internal data", "external data", "data privacy",
                         "data centers", "data driven", "customer data", "data from customer",
                         "data science", "data analysis", "big data",
                         "market data", "data sets")
    
    # Tokenize the text into unigrams and bigrams
    custom_tokenizer <- function(x) {
      unlist(lapply(ngrams(words(x), n = 1:2), paste, collapse = " "))
    }
    
    corpus <- tm_map(corpus, content_transformer(custom_tokenizer))
    
    # Create a DocumentTerm Matrix
    dtm <- DocumentTermMatrix(corpus)
    
    # Get the words of interest from the dictionary and rename the columns
    dictionary <- read_xlsx("C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/DictionaryLIWCDigital_OnlyDigital_TG.xlsx", col_names = FALSE)
    names(dictionary) <- c("term", "group")
    
    # Take the individual term lists and add the multi-word terms to each
    inno_terms    <- c(dictionary$term[dictionary$group == 1], multiword_terms)
    techno_terms  <- c(dictionary$term[dictionary$group == 2], multiword_terms)
    data_terms    <- c(dictionary$term[dictionary$group == 3], multiword_terms)
    digital_terms <- c(dictionary$term[dictionary$group == 4], multiword_terms)
    
    # Use multiple steps to...
    corpus_words <- tidy(dtm) %>% # ... transform the dtm to a tidy object
      bind_tf_idf(term, document, count) # ... use bind_tf_idf() from tidytext to calculate tf-idf
    
    total_words <- corpus_words %>% group_by(document) %>% summarize(total = sum(count)) # Calculate the number of words in each document
    corpus_words <- left_join(corpus_words, total_words, by = "document") # add it to the table
    
    # Filter the corpus for the words of interest
    TF_IDF_Inno_terms5    <- corpus_words %>% filter(term %in% inno_terms)
    TF_IDF_techno_terms5  <- corpus_words %>% filter(term %in% techno_terms)
    TF_IDF_data_terms5    <- corpus_words %>% filter(term %in% data_terms)
    TF_IDF_digital_terms5 <- corpus_words %>% filter(term %in% digital_terms)
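
Reading the tm documentation, I suspect attempt 5 fails because DocumentTermMatrix() re-tokenizes the transformed text on whitespace, splitting the bigrams apart again. A sketch of what I believe tm intends instead (untested on my data; as far as I can tell a custom tokenizer only takes effect with a VCorpus, not the SimpleCorpus that Corpus() returns here):

    # Tokenizer that emits both unigrams and bigrams (the pattern from the tm FAQ)
    uni_bigram_tokenizer <- function(x)
      unlist(lapply(ngrams(words(x), 1:2), paste, collapse = " "))
    
    # Pass the tokenizer through the control list instead of rewriting the
    # document text with tm_map()
    vcorpus <- VCorpus(DirSource(wd, recursive = TRUE))
    dtm <- DocumentTermMatrix(vcorpus,
                              control = list(tokenize = uni_bigram_tokenizer))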

Besides these attempts, I have also tried some of Julia's approaches using ngrams. The situation now is that the code returns only unigrams and does not retrieve the bigrams I have in the dictionary (I have checked manually, and several of the bigrams in my dictionary are in the files).

#ATTEMPT 6#

    # Load libraries (readr, purrr and stringr come in via tidyverse)
    library(tidyverse)
    library(tidytext)
    library(readxl)
    
    # Set paths
    folder_path <- "C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/2012"
    dictionary_path <- "C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/DictionaryLIWCDigital_OnlyDigital_TG.xlsx"
    
    # Read dictionary words
    dictionary <- read_excel(dictionary_path)
    dictionary_words <- dictionary$Word
    
    # Read all text files
    all_text <- list.files(path = folder_path, full.names = TRUE) %>%
      map_chr(read_file) %>%
      enframe(name = NULL, value = "text")
    
    # Tokenize unigrams and bigrams
    tidy_tokens <- all_text %>%
      unnest_tokens(token, text) %>%
      filter(token %in% dictionary_words)  # Filter by dictionary words
    
    tidy_bigrams <- all_text %>%
      unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
      filter(str_replace_all(bigram, " ", "_") %in% dictionary_words)  # Filter by dictionary words (bigrams are stored with underscores there)
    
    # Combine unigrams and bigrams
    combined_tokens <- bind_rows(
      tidy_tokens %>% mutate(type = "unigram"),
      tidy_bigrams %>% mutate(type = "bigram")
    )
    
    # Count token frequencies
    token_counts <- combined_tokens %>%
      count(type, token, sort = TRUE)
    
    token_counts
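
Attempt 6 also stops at raw counts and drops the document identity (enframe(name = NULL)), so TF-IDF cannot be computed from it. A sketch of the missing piece (my assumption: the file names can serve as document IDs):

    # Keep the file name as a document ID...
    all_text <- list.files(path = folder_path, full.names = TRUE) %>%
      set_names() %>%
      map_chr(read_file) %>%
      enframe(name = "document", value = "text")
    
    # ...re-run the tokenizing steps above so combined_tokens carries the
    # document column, then compute tf-idf per document
    token_tf_idf <- combined_tokens %>%
      count(document, token) %>%
      bind_tf_idf(token, document, n)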

#ATTEMPT 7#

    # Set paths
    folder_path <- "C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/2012"
    dictionary_path <- "C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/DictionaryLIWCDigital_OnlyDigital_TG.xlsx"
    
    # Read dictionary words
    dictionary <- read_excel(dictionary_path)
    dictionary_words <- dictionary$Word
    
    # Read all text files
    all_text <- list.files(path = folder_path, full.names = TRUE) %>%
      map_chr(read_file) %>%
      enframe(name = NULL, value = "text")
    
    # Tokenize unigrams and bigrams
    tidy_tokens <- all_text %>%
      unnest_tokens(token, text) %>%
      filter(token %in% dictionary_words)  # Filter by dictionary words
    
    tidy_bigrams <- all_text %>%
      unnest_tokens(bigram, text, token = "ngrams", n = 2)
    
    # Filter bigrams by checking both words against the dictionary
    tidy_bigrams <- tidy_bigrams %>%
      separate(bigram, c("word1", "word2"), sep = " ") %>%
      filter(word1 %in% dictionary_words, word2 %in% dictionary_words) %>%
      unite(bigram, word1, word2, sep = " ")  # Rejoin the filtered words
    
    # Combine unigrams and bigrams
    combined_tokens <- bind_rows(
      tidy_tokens %>% mutate(type = "unigram"),
      tidy_bigrams %>% mutate(type = "bigram")
    )
    
    # Count token frequencies
    token_counts <- combined_tokens %>%
      count(type, token, sort = TRUE)
    
    token_counts

Thanks :) 


1 Answer

Julia Silge:

If you want to analyze texts at a level of tokenization other than the single word, you'll need to set up that tokenization yourself (and probably not use Corpus()).

You can find bigrams like this:

    library(tidyverse)
    library(tidytext)
    
    tidy_bigrams <- janeaustenr::austen_books() |> 
      unnest_tokens(bigram, text, token = "ngrams", n = 2)
    
    bigram_counts <- tidy_bigrams |> 
      count(book, bigram, sort = TRUE) |> 
      filter(!is.na(bigram))
    
    bigram_counts
    #> # A tibble: 300,903 × 3
    #>    book                bigram     n
    #>    <fct>               <chr>  <int>
    #>  1 Mansfield Park      of the   712
    #>  2 Mansfield Park      to be    612
    #>  3 Emma                to be    586
    #>  4 Mansfield Park      in the   533
    #>  5 Emma                of the   529
    #>  6 Pride & Prejudice   of the   439
    #>  7 Emma                it was   430
    #>  8 Pride & Prejudice   to be    422
    #>  9 Sense & Sensibility to be    418
    #> 10 Emma                in the   416
    #> # ℹ 300,893 more rows

Created on 2023-08-15 with reprex v2.0.2

And then you can compute tf-idf for these bigrams like this:

    bigram_counts |> 
      bind_tf_idf(bigram, book, n) |> 
      arrange(desc(tf_idf))
    #> # A tibble: 300,903 × 6
    #>    book                bigram                n      tf   idf  tf_idf
    #>    <fct>               <chr>             <int>   <dbl> <dbl>   <dbl>
    #>  1 Pride & Prejudice   mr darcy            230 0.00206  1.79 0.00370
    #>  2 Persuasion          captain wentworth   143 0.00187  1.79 0.00335
    #>  3 Mansfield Park      sir thomas          266 0.00181  1.79 0.00324
    #>  4 Persuasion          mr elliot           133 0.00174  1.79 0.00312
    #>  5 Sense & Sensibility mrs jennings        185 0.00169  1.79 0.00303
    #>  6 Emma                mr knightley        239 0.00162  1.79 0.00291
    #>  7 Persuasion          lady russell        110 0.00144  1.79 0.00258
    #>  8 Persuasion          sir walter          108 0.00141  1.79 0.00253
    #>  9 Emma                mrs weston          208 0.00141  1.79 0.00253
    #> 10 Mansfield Park      miss crawford       196 0.00133  1.79 0.00239
    #> # ℹ 300,893 more rows

Created on 2023-08-15 with reprex v2.0.2


If you want to include, say, unigrams (words) plus bigrams you would use n = 2, n_min = 1 in the call to unnest_tokens().
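
For example, a quick sketch along those lines:

    # Unigrams and bigrams in one pass: n = 2 together with n_min = 1
    tidy_uni_bigrams <- janeaustenr::austen_books() |> 
      unnest_tokens(ngram, text, token = "ngrams", n = 2, n_min = 1)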