How to annotate word constituents within sentences?

60 Views Asked by At

I am using openNLP to annotate words within sentences throughout a text. As a final result, I would like each word ID to match the word's position within its sentence, with the numbering restarting from 1 each time a new sentence begins. Here is what I have so far:

# Build the example text and convert it with as.String()
string <- "Last morning, I went to the lake and sat. My dog is the cutest."
ex_string <- as.String(string)
# Annotate sentences first, then the words
init_s_w <- annotate(
  ex_string,
  list(
    Maxent_Sent_Token_Annotator(probs = TRUE),
    Maxent_Word_Token_Annotator(probs = TRUE)
  )
)
init_s_w
id type start end
1 sentence 1 41
2 sentence 43 63
3 word 1 4
4 word 6 12
5 word 13 13
6 word 15 15
7 word 17 20
8 word 22 23
9 word 25 27
10 word 29 32
11 word 34 36
12 word 38 40
13 word 41 41
14 word 43 44
15 word 46 48
16 word 50 51
17 word 53 55
18 word 57 62
19 word 63 63

Here is what I want:

id type start end
1 sentence 1 41
2 sentence 43 63
1 word 1 4
2 word 6 12
3 word 13 13
4 word 15 15
5 word 17 20
6 word 22 23
7 word 25 27
8 word 29 32
9 word 34 36
10 word 38 40
11 word 41 41
1 word 43 44
2 word 46 48
3 word 50 51
4 word 53 55
5 word 57 62
6 word 63 63
1

There is 1 best solution below

0
Skaqqs On

By manipulating your input table:

library(dplyr)

# NOTE(review): `a` is presumably the annotation table from the question as a
# data frame with columns id, type, start, end -- confirm how it is built.

# Character position at which each sentence ends
r <- a$end[a$type == "sentence"]
# Bin every row by its start position: one interval per sentence
a$pos <- cut(a$start, breaks = c(1, r), include.lowest = TRUE, right = TRUE)
b <- a[a$type == "word", ]
# Renumber the words from 1 inside each sentence, then drop the grouping so
# that later operations on `d` are not silently performed per sentence
d <- b %>%
  group_by(pos) %>%
  mutate(id = row_number()) %>%
  ungroup()

bind_rows(a[a$type == "sentence", ], d)

By starting from the beginning and building a more comprehensive dataset:

# Break the text into sentences by cutting right after every period
string <- "Last morning, I went to the lake and sat. My dog is the cutest."
split <- strsplit(string, "(?<=\\.)", perl = TRUE)[[1]]
split
#> [1] "Last morning, I went to the lake and sat."
#> [2] " My dog is the cutest."

# Split each sentence into tokens.
# The zero-width lookaheads cut in front of every whitespace and punctuation
# character, so spaces and punctuation are retained as their own elements.
# strsplit() is vectorised over its input, so `b` is a list holding one
# character vector of tokens per sentence.
b <- strsplit(split, split = "(?=\\s)|(?=[[:punct:]])", perl = TRUE)
b
#> [[1]]
#>  [1] "Last"    " "       "morning" ","       " "       "I"       " "      
#>  [8] "went"    " "       "to"      " "       "the"     " "       "lake"   
#> [15] " "       "and"     " "       "sat"     "."      
#> 
#> [[2]]
#>  [1] " "      "My"     " "      "dog"    " "      "is"     " "      "the"   
#>  [9] " "      "cutest" "."

# Combine the per-sentence token vectors into a single data frame:
# one row per token, tagged with the index of the sentence it came from.
# Built in one vectorised step, which also copes with an empty `b`
# (the original `1:length(b)` loop would iterate over 1 and 0 in that case).
df <- data.frame(
  word = as.character(unlist(b)),
  sentence = rep(seq_along(b), lengths(b))
)

# Get organized and calculate token positions
df$char <- nchar(df$word)
# Running index and character span of each token inside its own sentence
df$id <- ave(df$char, df$sentence, FUN = seq_along)
df$end <- ave(df$char, df$sentence, FUN = cumsum)
df$start <- df$end + 1 - df$char
# Character span of each token counted across all rows
df$end_overall <- cumsum(df$char)
df$start_overall <- df$end_overall + 1 - df$char

# Full dataset
df <- df[, c(
  "sentence", "id", "word", "char",
  "start", "end", "start_overall", "end_overall"
)]
df
#>    sentence id    word char start end start_overall end_overall
#> 1         1  1    Last    4     1   4             1           4
#> 2         1  2            1     5   5             5           5
#> 3         1  3 morning    7     6  12             6          12
#> 4         1  4       ,    1    13  13            13          13
#> 5         1  5            1    14  14            14          14
#> 6         1  6       I    1    15  15            15          15
#> 7         1  7            1    16  16            16          16
#> 8         1  8    went    4    17  20            17          20
#> 9         1  9            1    21  21            21          21
#> 10        1 10      to    2    22  23            22          23
#> 11        1 11            1    24  24            24          24
#> 12        1 12     the    3    25  27            25          27
#> 13        1 13            1    28  28            28          28
#> 14        1 14    lake    4    29  32            29          32
#> 15        1 15            1    33  33            33          33
#> 16        1 16     and    3    34  36            34          36
#> 17        1 17            1    37  37            37          37
#> 18        1 18     sat    3    38  40            38          40
#> 19        1 19       .    1    41  41            41          41
#> 20        2  1            1     1   1            42          42
#> 21        2  2      My    2     2   3            43          44
#> 22        2  3            1     4   4            45          45
#> 23        2  4     dog    3     5   7            46          48
#> 24        2  5            1     8   8            49          49
#> 25        2  6      is    2     9  10            50          51
#> 26        2  7            1    11  11            52          52
#> 27        2  8     the    3    12  14            53          55
#> 28        2  9            1    15  15            56          56
#> 29        2 10  cutest    6    16  21            57          62
#> 30        2 11       .    1    22  22            63          63

# Or to match your expected result
# Keep only the real tokens (drop the single-space separators)
df2 <- df[!(df$word %in% " "), ]
# Word number inside each sentence, restarting at 1 for every sentence
df2$newid <- ave(df2$char, df2$sentence, FUN = seq_along)
df2$type <- "word"
words <- df2[, c("type", "start", "end", "start_overall", "end_overall")]
words$id <- df2$newid

# Derive each sentence's span from its first and last token instead of
# hard-coding it. As before, sentence rows carry whole-text positions in
# `start`/`end`, while word rows carry positions inside their sentence.
sent_start <- as.vector(tapply(df2$start_overall, df2$sentence, min))
sent_end <- as.vector(tapply(df2$end_overall, df2$sentence, max))
sentences <- data.frame(
  type = rep("sentence", length(sent_start)),
  start = sent_start,
  end = sent_end,
  start_overall = sent_start,
  end_overall = sent_end,
  id = seq_along(sent_start)
)

# Bind data frames rather than character vectors: rbind()-ing a character
# vector onto a data frame turns every numeric column into character
df3 <- rbind(words, sentences)
rownames(df3) <- NULL
df3
#>        type start end start_overall end_overall id
#> 1      word     1   4             1           4  1
#> 2      word     6  12             6          12  2
#> 3      word    13  13            13          13  3
#> 4      word    15  15            15          15  4
#> 5      word    17  20            17          20  5
#> 6      word    22  23            22          23  6
#> 7      word    25  27            25          27  7
#> 8      word    29  32            29          32  8
#> 9      word    34  36            34          36  9
#> 10     word    38  40            38          40 10
#> 11     word    41  41            41          41 11
#> 12     word     2   3            43          44  1
#> 13     word     5   7            46          48  2
#> 14     word     9  10            50          51  3
#> 15     word    12  14            53          55  4
#> 16     word    16  21            57          62  5
#> 17     word    22  22            63          63  6
#> 18 sentence     1  41             1          41  1
#> 19 sentence    43  63            43          63  2
Created on 2021-10-22 by the reprex package (v2.0.1)