How to annotate word constituents within sentences?

Question

How to annotate word constituents within sentences?

60 Views Asked by Linkr1 At 21 October 2021 at 17:02

I am using openNLP to annotate words within sentences throughout a text. As a final result, I would like each word's ID to match its position within its sentence, with the numbering restarting from 1 each time a new sentence begins. Here is what I have so far:

# Example text, wrapped with as.String() before annotation
string <- "Last morning, I went to the lake and sat. My dog is the cutest."
ex_string <- as.String(string)
# Sentence annotator first, then word annotator, both with probs = TRUE
annotators <- list(
  Maxent_Sent_Token_Annotator(probs = TRUE),
  Maxent_Word_Token_Annotator(probs = TRUE)
)
init_s_w <- annotate(ex_string, annotators)
init_s_w

id	type	start	end
1	sentence	1	41
2	sentence	43	63
3	word	1	4
4	word	6	12
5	word	13	13
6	word	15	15
7	word	17	20
8	word	22	23
9	word	25	27
10	word	29	32
11	word	34	36
12	word	38	40
13	word	41	41
14	word	43	44
15	word	46	48
16	word	50	51
17	word	53	55
18	word	57	62
19	word	63	63

Here is what I want:

id	type	start	end
1	sentence	1	41
2	sentence	43	63
1	word	1	4
2	word	6	12
3	word	13	13
4	word	15	15
5	word	17	20
6	word	22	23
7	word	25	27
8	word	29	32
9	word	34	36
10	word	38	40
11	word	41	41
1	word	43	44
2	word	46	48
3	word	50	51
4	word	53	55
5	word	57	62
6	word	63	63

Original Q&A

There is 1 best solution below

**Skaqqs** · Answer 1 · 2021-10-21T19:43:33.663000

By manipulating your input table:

library(dplyr)

# NOTE(review): `a` is presumably the question's annotation table as a data
# frame with columns id, type, start and end -- confirm how it was created.

# End offsets of the sentence rows; used as the upper bin edges below.
r <- a[a$type == "sentence", "end"]
# Tag every row with the interval (sentence) that contains its start offset.
a$pos <- cut(a$start, breaks = c(1, r), include.lowest = TRUE, right = TRUE)
b <- a[a$type == "word", ]
# Renumber the words from 1 within each interval, then drop the grouping so
# that `d` is not left as a grouped table for later steps.
d <- b %>%
  group_by(pos) %>%
  mutate(id = row_number()) %>%
  ungroup()

# Sentence rows first, followed by the renumbered word rows.
bind_rows(a[a$type == "sentence", ], d)

By starting from the beginning and building a more comprehensive dataset:

# Split the string into sentences
# (zero-width lookbehind: the break falls right after each full stop)
string <- "Last morning, I went to the lake and sat. My dog is the cutest."
split <- unlist(strsplit(string, "(?<=\\.)", perl = TRUE))
split
#> [1] "Last morning, I went to the lake and sat."
#> [2] " My dog is the cutest."

# Split each sentence into tokens
# Retain punctuation and spaces as elements (using zero-width lookahead)
# Each split sentence is saved as a sublist
a <- lapply(split, function(x) strsplit(x, split = "(?=\\s)|(?=[[:punct:]])", perl = TRUE))
b <- unlist(a, recursive = FALSE)
b
#> [[1]]
#>  [1] "Last"    " "       "morning" ","       " "       "I"       " "      
#>  [8] "went"    " "       "to"      " "       "the"     " "       "lake"   
#> [15] " "       "and"     " "       "sat"     "."      
#> 
#> [[2]]
#>  [1] " "      "My"     " "      "dog"    " "      "is"     " "      "the"   
#>  [9] " "      "cutest" "."

# Combine the sublists into one data frame, tagging each token with the
# number of the sentence it came from. All pieces are bound in a single
# rbind() call instead of growing `df` row block by row block in a loop.
df <- do.call(rbind, lapply(seq_along(b), function(i) {
  data.frame(word = b[[i]], sentence = i, stringsAsFactors = FALSE)
}))

# Get organized and calculate word positions
df$char <- nchar(df$word)
# Within-sentence offsets: running character count restarted per sentence
df$end <- ave(df$char, df$sentence, FUN = cumsum)
df$id <- ave(df$char, df$sentence, FUN = seq_along)
df$start <- df$end + 1 - df$char
# Offsets relative to the whole string: one running count over all tokens
df$end_overall <- cumsum(df$char)
df$start_overall <- df$end_overall + 1 - df$char

# Full dataset
df <- df[, c("sentence", "id", "word", "char", "start", "end", "start_overall", "end_overall")]
df
#>    sentence id    word char start end start_overall end_overall
#> 1         1  1    Last    4     1   4             1           4
#> 2         1  2            1     5   5             5           5
#> 3         1  3 morning    7     6  12             6          12
#> 4         1  4       ,    1    13  13            13          13
#> 5         1  5            1    14  14            14          14
#> 6         1  6       I    1    15  15            15          15
#> 7         1  7            1    16  16            16          16
#> 8         1  8    went    4    17  20            17          20
#> 9         1  9            1    21  21            21          21
#> 10        1 10      to    2    22  23            22          23
#> 11        1 11            1    24  24            24          24
#> 12        1 12     the    3    25  27            25          27
#> 13        1 13            1    28  28            28          28
#> 14        1 14    lake    4    29  32            29          32
#> 15        1 15            1    33  33            33          33
#> 16        1 16     and    3    34  36            34          36
#> 17        1 17            1    37  37            37          37
#> 18        1 18     sat    3    38  40            38          40
#> 19        1 19       .    1    41  41            41          41
#> 20        2  1            1     1   1            42          42
#> 21        2  2      My    2     2   3            43          44
#> 22        2  3            1     4   4            45          45
#> 23        2  4     dog    3     5   7            46          48
#> 24        2  5            1     8   8            49          49
#> 25        2  6      is    2     9  10            50          51
#> 26        2  7            1    11  11            52          52
#> 27        2  8     the    3    12  14            53          55
#> 28        2  9            1    15  15            56          56
#> 29        2 10  cutest    6    16  21            57          62
#> 30        2 11       .    1    22  22            63          63

# Or to match your expected result
# Drop the single-space tokens so that only words and punctuation remain
df2 <- df[!(df$word %in% c(" ")), ]
# Token number restarting at 1 within each sentence
df2$newid <- ave(df2$char, df2$sentence, FUN = seq_along)
# Running offsets that count only the retained tokens
df2$end2 <- cumsum(df2$char)
df2$start2 <- df2$end2 + 1 - df2$char
df2$type <- "word"

# Sentence bounds are taken from the tokens themselves (first start and last
# end per sentence, in whole-string offsets) instead of being typed in by hand.
# As in the original rows, a sentence's start/end repeat its overall offsets.
sent_start <- as.vector(tapply(df2$start_overall, df2$sentence, min))
sent_end <- as.vector(tapply(df2$end_overall, df2$sentence, max))
sentences <- data.frame(
  type = "sentence",
  start = sent_start,
  end = sent_end,
  start_overall = sent_start,
  end_overall = sent_end,
  stringsAsFactors = FALSE
)

# Bind a data frame rather than character vectors: rbind() with
# c("sentence", 1, 41, ...) would turn every offset column into character.
df3 <- rbind(df2[, c("type", "start", "end", "start_overall", "end_overall")],
             sentences)
df3
#>         type start end start_overall end_overall
#> 1       word     1   4             1           4
#> 3       word     6  12             6          12
#> 4       word    13  13            13          13
#> 6       word    15  15            15          15
#> 8       word    17  20            17          20
#> 10      word    22  23            22          23
#> 12      word    25  27            25          27
#> 14      word    29  32            29          32
#> 16      word    34  36            34          36
#> 18      word    38  40            38          40
#> 19      word    41  41            41          41
#> 21      word     2   3            43          44
#> 23      word     5   7            46          48
#> 25      word     9  10            50          51
#> 27      word    12  14            53          55
#> 29      word    16  21            57          62
#> 30      word    22  22            63          63
#> 181 sentence     1  41             1          41
#> 191 sentence    43  63            43          63
Created on 2021-10-22 by the reprex package (v2.0.1)

How to annotate word constituents within sentences?

There is 1 best solution below

Related Questions in R

Related Questions in STRING

Related Questions in ANNOTATIONS

Related Questions in OPENNLP

Trending Questions

Popular # Hashtags

Popular Questions