Identifies extreme values in a dataset, based on the length of the cleaned notes.
Can be used after applying firstnchar().
Examples
# Small example data set: one row per page of free-text notes for each ID.
test_dataset <- data.frame(
  ID = c("1", "1", "2", "2", "1", "3", "3"),
  Notes = c(
    "The", "The cat", "The", "The dog", "The cat ran",
    "the chicken was chased", "The goat chased the chicken"
  ),
  Page = c(1, 2, 1, 2, 3, 1, 2)
)

# Clean the notes first; the result is expected to carry the "page_notes"
# column that is handed to extremeid() below.
cleaned_dataset <- firstnchar(
  dataset = test_dataset,
  notes = "Notes",
  char_diff = 3,
  identifier = "ID",
  pageid = "Page"
)

# Flag extreme values from the cleaned notes; the result is printed because
# the call is not assigned.
extremeid(
  dataset = cleaned_dataset,
  clean_notes = "page_notes",
  extreme = 2,
  pageid = "Page"
)
#> ID Notes Page page_notes edit_distance
#> 1 1 The 1 The NA
#> 2 1 The cat 2 cat 0
#> 3 2 The 1 The NA
#> 4 2 The dog 2 dog 0
#> 5 1 The cat ran 3 ran 0
#> 6 3 the chicken was chased 1 the chicken was chased NA
#> 7 3 The goat chased the chicken 2 The goat chased the chicken 17
#> note_length outlier mean sd extreme_value
#> 1 3 31.27264 9.333333 10.96966 FALSE
#> 2 3 38.71281 11.000000 13.85641 FALSE
#> 3 3 31.27264 9.333333 10.96966 FALSE
#> 4 3 38.71281 11.000000 13.85641 FALSE
#> 5 3 NA 3.000000 NA NA
#> 6 22 31.27264 9.333333 10.96966 FALSE
#> 7 27 38.71281 11.000000 13.85641 FALSE