Find the variance based on a threshold in R

176 Views Asked by At

This datasets contains 42 IDs with 1440 observations per ID. This means that the dataset got 60480 observations in total. The values of the columns AI_1..., to AI_7 got values that range from 0.00 to approx 3.. (depending on ID). I try to make certain tresholds for each column (AI_1 to AI_7) that gives the value 1 if AI_n < 0 and 0 if AI_n > 0. This values should be entered in a new column called activity_1, ..., activity_7. I did this with the following code which worked:

suppressPackageStartupMessages({
  library(dplyr)
  library(stringr)
})

Threshold <- AI %>%
  select(-starts_with("activity")) %>%
  mutate(across(starts_with("AI_"), ~ as.integer(.x <= 0.0), .names = "activity_{col}")) %>%
  rename_at(vars(starts_with("activity_AI")), ~ str_remove(., "_AI_"))

What I want is to create the Standard deviation and Variation of the AI_1, ..., AI_7 grouped by ID only for those threshold values that exceed the 0. I.e., where the activity_1, ..., activity_7 was 0. Because the 0 values were the point that were bigger than the value 0. I tried it with the following code but it also takes the 0.0 values into account which I want to exclude:

AI_SD <- Threshold %>% group_by(ID) %>% summarise(across(everything(), sd))

This code makes a new dataframe where the 1440 point are converted to 1 point per ID so now we have 42 observation instead of 60480.

This is the data without the activity thresholds

structure(list(X = 1:20, x1.time = c("00:00:00", "00:01:00", 
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00", 
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00", 
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00"
), AI_1 = c(0.17532896077581, 0.174249939439765, 0.174170544792533, 
0.172877357886967, 0.173679017353614, 0.174216799443538, 0.174514454250882, 
0.174656389074666, 0.173377175454716, 0.173044040397703, 0.172476572884875, 
0.174738790856458, 0.173833445732856, 0.174229265722835, 0.174392878820111, 
0.174715890976243, 0.174241614289181, 0.173229751013599, 0.173579164085914, 
0.173829069216696), AI_2 = c(0.173549588758752, 0, 0.85729795236214, 
0.513925586220723, 0.140789239632585, 0.0989981552300843, 0.321625480480368, 
0.62540390366724, 0.00714855410741877, 0, 0, 0, 0.212943798631015, 
0, 0, 0.023650258664654, 0.00159158576982517, 0.0172670511608436, 
0, 0), AI_3 = c(0.026069149474549, 0.0417747330978121, 0.276687600798659, 
0.258591321128928, 0.208790296683244, 0.0300099278967508, 0.15234594700642, 
0.26519848659315, 0.34220566727692, 0.352310255219813, 0.297621781376737, 
0.292800000618149, 0.481566536382664, 0.337770306519177, 0.743182296874282, 
0.256202127993172, 0.201340506649845, 0.200155318345632, 0.237126429055375, 
0.234974163009848), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), AI_5 = c(0, 0, 0.0015062890214412, 
0.00154798776365785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0), AI_6 = c(0.190018331633492, 0.241159552783285, 0.231916111803065, 
0.193196835220518, 0.240381778378367, 0.266125762332231, 0.339227319507121, 
0.354841547583334, 0.277011867279295, 0.474462632995715, 0.516356521276347, 
0.559477604383845, 0.374857636694405, 0.376675155204282, 0.516347133869462, 
0.627633542885353, 0.565732682034457, 0.544148310829377, 0.545022418887296, 
0.602327138107482), AI_7 = c(0.139608768263461, 0.165583663096789, 
0.326959508587122, 0.221739297198209, 0.160657663051105, 0.107439748199699, 
0.117594125364214, 0.133528520361788, 0.117950354159875, 0.131428192187155, 
0.125355403562937, 0.119185646272255, 0.196285453922129, 0.167061057207379, 
0.169855099745761, 0.141077126343563, 0.078433720675593, 0.0999303057993443, 
0.0798045801131668, 0.0331137028671696), ID = c("ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1"
)), row.names = c(NA, 20L), class = "data.frame")

This is the data with the activity thresholds

structure(list(X = 1:20, x1.time = c("00:00:00", "00:01:00", 
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00", 
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00", 
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00"
), AI_1 = c(0.17532896077581, 0.174249939439765, 0.174170544792533, 
0.172877357886967, 0.173679017353614, 0.174216799443538, 0.174514454250882, 
0.174656389074666, 0.173377175454716, 0.173044040397703, 0.172476572884875, 
0.174738790856458, 0.173833445732856, 0.174229265722835, 0.174392878820111, 
0.174715890976243, 0.174241614289181, 0.173229751013599, 0.173579164085914, 
0.173829069216696), AI_2 = c(0.173549588758752, 0, 0.85729795236214, 
0.513925586220723, 0.140789239632585, 0.0989981552300843, 0.321625480480368, 
0.62540390366724, 0.00714855410741877, 0, 0, 0, 0.212943798631015, 
0, 0, 0.023650258664654, 0.00159158576982517, 0.0172670511608436, 
0, 0), AI_3 = c(0.026069149474549, 0.0417747330978121, 0.276687600798659, 
0.258591321128928, 0.208790296683244, 0.0300099278967508, 0.15234594700642, 
0.26519848659315, 0.34220566727692, 0.352310255219813, 0.297621781376737, 
0.292800000618149, 0.481566536382664, 0.337770306519177, 0.743182296874282, 
0.256202127993172, 0.201340506649845, 0.200155318345632, 0.237126429055375, 
0.234974163009848), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), AI_5 = c(0, 0, 0.0015062890214412, 
0.00154798776365785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0), AI_6 = c(0.190018331633492, 0.241159552783285, 0.231916111803065, 
0.193196835220518, 0.240381778378367, 0.266125762332231, 0.339227319507121, 
0.354841547583334, 0.277011867279295, 0.474462632995715, 0.516356521276347, 
0.559477604383845, 0.374857636694405, 0.376675155204282, 0.516347133869462, 
0.627633542885353, 0.565732682034457, 0.544148310829377, 0.545022418887296, 
0.602327138107482), AI_7 = c(0.139608768263461, 0.165583663096789, 
0.326959508587122, 0.221739297198209, 0.160657663051105, 0.107439748199699, 
0.117594125364214, 0.133528520361788, 0.117950354159875, 0.131428192187155, 
0.125355403562937, 0.119185646272255, 0.196285453922129, 0.167061057207379, 
0.169855099745761, 0.141077126343563, 0.078433720675593, 0.0999303057993443, 
0.0798045801131668, 0.0331137028671696), ID = c("ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1"
), activity1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), activity2 = c(0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 
1L), activity3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), activity4 = c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L), activity5 = c(1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), activity6 = c(0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L), activity7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 20L), class = "data.frame")

**The question is: how can I find the variance or standard deviation grouped by ID only for those values that have the value 0 (i.e., exceeds the threshold of 0)**
1

There are 1 best solutions below

0
langtang On BEST ANSWER

You can limit the call to sd to only those elements of .x where .x exceeds 0.

data %>% 
  group_by(ID) %>% 
  summarize(across(starts_with("AI"),~sd(.x[.x>0])))

Output:

  ID        AI_1  AI_2  AI_3  AI_4      AI_5  AI_6   AI_7
  <chr>    <dbl> <dbl> <dbl> <dbl>     <dbl> <dbl>  <dbl>
1 ID1   0.000715 0.278 0.160    NA 0.0000295 0.150 0.0610