all_lower_case()
: Translate all non-numeric strings of a data frame to lower case.
all_upper_case()
: Translate all non-numeric strings of a data frame to upper case.
all_title_case()
: Translate all non-numeric strings of a data frame to title case.
first_upper_case()
: Translate the first word of a string to upper case.
extract_number()
: Extract the number(s) of a string.
extract_string()
: Extract all strings, ignoring case.
find_text_in_num()
: Find text characters in a numeric sequence and return the row index.
has_text_in_num()
: Inspect columns looking for text in numeric sequence and return a warning if text is found.
remove_space()
: Remove all blank spaces of a string.
remove_strings()
: Remove all strings of a variable.
replace_number()
: Replace numbers with a replacement.
replace_string()
: Replace all strings with a replacement, ignoring case.
round_cols()
: Round a selected column or a whole data frame to significant figures.
tidy_strings()
: Tidy up character strings, non-numeric columns, or any selected columns in a data frame by putting all words in upper case, replacing any space, tabulation, or punctuation character by '_', and putting '_' between lower and upper case. Suppose that str = c("Env1", "env 1", "env.1") (which by definition should represent a unique level in plant breeding trials, e.g., environment 1) is subjected to tidy_strings(str): the result will then be c("ENV_1", "ENV_1", "ENV_1"). See Examples section for more examples.
Usage
all_upper_case(.data, ...)
all_lower_case(.data, ...)
all_title_case(.data, ...)
first_upper_case(.data, ...)
extract_number(.data, ..., pattern = NULL)
extract_string(.data, ..., pattern = NULL)
find_text_in_num(.data, ...)
has_text_in_num(.data)
remove_space(.data, ...)
remove_strings(.data, ...)
replace_number(
.data,
...,
pattern = NULL,
replacement = "",
ignore_case = FALSE
)
replace_string(
.data,
...,
pattern = NULL,
replacement = "",
ignore_case = FALSE
)
round_cols(.data, ..., digits = 2)
tidy_strings(.data, ..., sep = "_")
Arguments
- .data
A data frame
- ...
The argument depends on the function used.
For round_cols(), ... are the variables to round. If no variable is supplied, all the numeric variables from .data are used.
For all_lower_case(), all_upper_case(), all_title_case(), extract_number(), extract_string(), remove_strings(), and tidy_strings(), ... are the variables to apply the function to. If no variable is supplied, the function will be applied to all non-numeric variables in .data.
- pattern
A string to be matched. Regular Expression Syntax is also allowed.
- replacement
A string for replacement.
- ignore_case
If FALSE (default), the pattern matching is case sensitive; if TRUE, case is ignored during matching.
- digits
The number of decimal places to round to (e.g., digits = 1 keeps one decimal place, as shown in the Examples). Defaults to 2.
- sep
A character string to separate the terms. Defaults to "_".
Author
Tiago Olivoto tiagoolivoto@gmail.com
Examples
# \donttest{
library(metan)
################ Rounding numbers ###############
# All numeric columns
round_cols(data_ge2, digits = 1)
#> # A tibble: 156 × 18
#> ENV GEN REP PH EH EP EL ED CL CD CW KW NR
#> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A1 H1 1 2.6 1.7 0.7 16.1 52.2 28.1 16.3 25.1 216. 15.6
#> 2 A1 H1 2 2.9 1.8 0.6 14.2 50.3 27.6 14.5 21.4 184. 16
#> 3 A1 H1 3 2.7 1.6 0.6 16 50.7 28.4 16.4 24 208. 17.2
#> 4 A1 H10 1 2.8 1.6 0.6 16.7 54.1 31.7 17.4 26.2 194. 15.6
#> 5 A1 H10 2 2.8 1.7 0.6 14.9 52.7 32 15.5 20.7 176. 17.6
#> 6 A1 H10 3 2.7 1.5 0.6 16.7 52.7 30.4 17.5 26.8 207. 16.8
#> 7 A1 H11 1 2.8 1.5 0.5 17.4 51.7 30.6 18 26.2 217. 16.8
#> 8 A1 H11 2 2.7 1.6 0.6 16.7 47.2 28.7 17.2 24.1 181. 13.6
#> 9 A1 H11 3 2.8 1.7 0.6 15.8 47.9 27.6 16.4 20.5 166. 15.2
#> 10 A1 H12 1 2.7 1.5 0.6 14.9 47.5 28.2 15.5 20.1 161 14.8
#> # … with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>,
#> # PERK <dbl>, TKW <dbl>, NKE <dbl>
# Round specific columns
round_cols(data_ge2, EP, digits = 1)
#> # A tibble: 156 × 18
#> ENV GEN REP PH EH EP EL ED CL CD CW KW NR
#> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A1 H1 1 2.61 1.71 0.7 16.1 52.2 28.1 16.3 25.1 217. 15.6
#> 2 A1 H1 2 2.87 1.76 0.6 14.2 50.3 27.6 14.5 21.4 184. 16
#> 3 A1 H1 3 2.68 1.58 0.6 16.0 50.7 28.4 16.4 24.0 208. 17.2
#> 4 A1 H10 1 2.83 1.64 0.6 16.7 54.1 31.7 17.4 26.2 194. 15.6
#> 5 A1 H10 2 2.79 1.71 0.6 14.9 52.7 32.0 15.5 20.7 176. 17.6
#> 6 A1 H10 3 2.72 1.51 0.6 16.7 52.7 30.4 17.5 26.8 207. 16.8
#> 7 A1 H11 1 2.75 1.51 0.5 17.4 51.7 30.6 18.0 26.2 217. 16.8
#> 8 A1 H11 2 2.72 1.56 0.6 16.7 47.2 28.7 17.2 24.1 181. 13.6
#> 9 A1 H11 3 2.77 1.67 0.6 15.8 47.9 27.6 16.4 20.5 166. 15.2
#> 10 A1 H12 1 2.73 1.54 0.6 14.9 47.5 28.2 15.5 20.1 161. 14.8
#> # … with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>,
#> # PERK <dbl>, TKW <dbl>, NKE <dbl>
########### Extract or replace numbers ##########
# Extract numbers
extract_number(data_ge, GEN)
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <fct> <dbl> <fct> <dbl> <dbl>
#> 1 E1 1 1 2.17 44.9
#> 2 E1 1 2 2.50 46.9
#> 3 E1 1 3 2.43 47.8
#> 4 E1 2 1 3.21 45.2
#> 5 E1 2 2 2.93 45.3
#> 6 E1 2 3 2.56 45.5
#> 7 E1 3 1 2.77 46.7
#> 8 E1 3 2 3.62 43.2
#> 9 E1 3 3 2.28 47.8
#> 10 E1 4 1 2.36 47.9
#> # … with 410 more rows
# Replace numbers
replace_number(data_ge, GEN)
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <fct> <chr> <fct> <dbl> <dbl>
#> 1 E1 G 1 2.17 44.9
#> 2 E1 G 2 2.50 46.9
#> 3 E1 G 3 2.43 47.8
#> 4 E1 G 1 3.21 45.2
#> 5 E1 G 2 2.93 45.3
#> 6 E1 G 3 2.56 45.5
#> 7 E1 G 1 2.77 46.7
#> 8 E1 G 2 3.62 43.2
#> 9 E1 G 3 2.28 47.8
#> 10 E1 G 1 2.36 47.9
#> # … with 410 more rows
replace_number(data_ge,
GEN,
pattern = 1,
replacement = "_one")
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <fct> <chr> <fct> <dbl> <dbl>
#> 1 E1 G_one 1 2.17 44.9
#> 2 E1 G_one 2 2.50 46.9
#> 3 E1 G_one 3 2.43 47.8
#> 4 E1 G2 1 3.21 45.2
#> 5 E1 G2 2 2.93 45.3
#> 6 E1 G2 3 2.56 45.5
#> 7 E1 G3 1 2.77 46.7
#> 8 E1 G3 2 3.62 43.2
#> 9 E1 G3 3 2.28 47.8
#> 10 E1 G4 1 2.36 47.9
#> # … with 410 more rows
########## Extract, replace or remove strings ##########
# Extract strings
extract_string(data_ge, GEN)
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <fct> <chr> <fct> <dbl> <dbl>
#> 1 E1 G 1 2.17 44.9
#> 2 E1 G 2 2.50 46.9
#> 3 E1 G 3 2.43 47.8
#> 4 E1 G 1 3.21 45.2
#> 5 E1 G 2 2.93 45.3
#> 6 E1 G 3 2.56 45.5
#> 7 E1 G 1 2.77 46.7
#> 8 E1 G 2 3.62 43.2
#> 9 E1 G 3 2.28 47.8
#> 10 E1 G 1 2.36 47.9
#> # … with 410 more rows
# Replace strings
replace_string(data_ge, GEN)
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <fct> <chr> <fct> <dbl> <dbl>
#> 1 E1 1 1 2.17 44.9
#> 2 E1 1 2 2.50 46.9
#> 3 E1 1 3 2.43 47.8
#> 4 E1 2 1 3.21 45.2
#> 5 E1 2 2 2.93 45.3
#> 6 E1 2 3 2.56 45.5
#> 7 E1 3 1 2.77 46.7
#> 8 E1 3 2 3.62 43.2
#> 9 E1 3 3 2.28 47.8
#> 10 E1 4 1 2.36 47.9
#> # … with 410 more rows
replace_string(data_ge,
GEN,
pattern = "G",
replacement = "GENOTYPE_")
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <fct> <chr> <fct> <dbl> <dbl>
#> 1 E1 GENOTYPE_1 1 2.17 44.9
#> 2 E1 GENOTYPE_1 2 2.50 46.9
#> 3 E1 GENOTYPE_1 3 2.43 47.8
#> 4 E1 GENOTYPE_2 1 3.21 45.2
#> 5 E1 GENOTYPE_2 2 2.93 45.3
#> 6 E1 GENOTYPE_2 3 2.56 45.5
#> 7 E1 GENOTYPE_3 1 2.77 46.7
#> 8 E1 GENOTYPE_3 2 3.62 43.2
#> 9 E1 GENOTYPE_3 3 2.28 47.8
#> 10 E1 GENOTYPE_4 1 2.36 47.9
#> # … with 410 more rows
# Remove strings
remove_strings(data_ge)
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 2.17 44.9
#> 2 1 1 2 2.50 46.9
#> 3 1 1 3 2.43 47.8
#> 4 1 2 1 3.21 45.2
#> 5 1 2 2 2.93 45.3
#> 6 1 2 3 2.56 45.5
#> 7 1 3 1 2.77 46.7
#> 8 1 3 2 3.62 43.2
#> 9 1 3 3 2.28 47.8
#> 10 1 4 1 2.36 47.9
#> # … with 410 more rows
remove_strings(data_ge, ENV)
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <dbl> <fct> <fct> <dbl> <dbl>
#> 1 1 G1 1 2.17 44.9
#> 2 1 G1 2 2.50 46.9
#> 3 1 G1 3 2.43 47.8
#> 4 1 G2 1 3.21 45.2
#> 5 1 G2 2 2.93 45.3
#> 6 1 G2 3 2.56 45.5
#> 7 1 G3 1 2.77 46.7
#> 8 1 G3 2 3.62 43.2
#> 9 1 G3 3 2.28 47.8
#> 10 1 G4 1 2.36 47.9
#> # … with 410 more rows
############ Find text in numeric sequences ###########
mixed_text <- data.frame(data_ge)
mixed_text[2, 4] <- "2..503"
mixed_text[3, 4] <- "3.2o75"
find_text_in_num(mixed_text, GY)
#> GY
#> [1,] 2
#> [2,] 3
############# upper, lower and title cases ############
gen_text <- c("This is the first string.", "this is the second one")
all_lower_case(gen_text)
#> [1] "this is the first string." "this is the second one"
all_upper_case(gen_text)
#> [1] "THIS IS THE FIRST STRING." "THIS IS THE SECOND ONE"
all_title_case(gen_text)
#> [1] "This Is The First String." "This Is The Second One"
first_upper_case(gen_text)
#> [1] "This is the first string." "This is the second one"
# A whole data frame
all_lower_case(data_ge)
#> # A tibble: 420 × 5
#> ENV GEN REP GY HM
#> <chr> <chr> <chr> <dbl> <dbl>
#> 1 e1 g1 1 2.17 44.9
#> 2 e1 g1 2 2.50 46.9
#> 3 e1 g1 3 2.43 47.8
#> 4 e1 g2 1 3.21 45.2
#> 5 e1 g2 2 2.93 45.3
#> 6 e1 g2 3 2.56 45.5
#> 7 e1 g3 1 2.77 46.7
#> 8 e1 g3 2 3.62 43.2
#> 9 e1 g3 3 2.28 47.8
#> 10 e1 g4 1 2.36 47.9
#> # … with 410 more rows
############### Tidy up messy text string ##############
messy_env <- c("ENV 1", "Env 1", "Env1", "env1", "Env.1", "Env_1")
tidy_strings(messy_env)
#> [1] "ENV_1" "ENV_1" "ENV_1" "ENV_1" "ENV_1" "ENV_1"
messy_gen <- c("GEN1", "gen 2", "Gen.3", "gen-4", "Gen_5", "GEN_6")
tidy_strings(messy_gen)
#> [1] "GEN_1" "GEN_2" "GEN_3" "GEN_4" "GEN_5" "GEN_6"
messy_int <- c("EnvGen", "Env_Gen", "env gen", "Env Gen", "ENV.GEN", "ENV_GEN")
tidy_strings(messy_int)
#> [1] "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN"
library(tibble)
# Or a whole data frame
df <- tibble(Env = messy_env,
gen = messy_gen,
Env_GEN = interaction(Env, gen),
y = rnorm(6, 300, 10))
df
#> # A tibble: 6 × 4
#> Env gen Env_GEN y
#> <chr> <chr> <fct> <dbl>
#> 1 ENV 1 GEN1 ENV 1.GEN1 305.
#> 2 Env 1 gen 2 Env 1.gen 2 302.
#> 3 Env1 Gen.3 Env1.Gen.3 292.
#> 4 env1 gen-4 env1.gen-4 318.
#> 5 Env.1 Gen_5 Env.1.Gen_5 301.
#> 6 Env_1 GEN_6 Env_1.GEN_6 309.
tidy_strings(df)
#> # A tibble: 6 × 4
#> Env gen Env_GEN y
#> <chr> <chr> <chr> <dbl>
#> 1 ENV_1 GEN_1 ENV_1_GEN_1 305.
#> 2 ENV_1 GEN_2 ENV_1_GEN_2 302.
#> 3 ENV_1 GEN_3 ENV_1_GEN_3 292.
#> 4 ENV_1 GEN_4 ENV_1_GEN_4 318.
#> 5 ENV_1 GEN_5 ENV_1_GEN_5 301.
#> 6 ENV_1 GEN_6 ENV_1_GEN_6 309.
# }