Utilities for handling numbers and strings

all_lower_case(): Translate all non-numeric strings of a data frame to lower case.
all_upper_case(): Translate all non-numeric strings of a data frame to upper case.
all_title_case(): Translate all non-numeric strings of a data frame to title case.
first_upper_case(): Translate the first letter of a string to upper case.
extract_number(): Extract the number(s) of a string.
extract_string(): Extract all strings, ignoring case.
find_text_in_num(): Find text characters in a numeric sequence and return the row index.
has_text_in_num(): Inspect columns looking for text in numeric sequence and return a warning if text is found.
remove_space(): Remove all blank spaces of a string.
remove_strings(): Remove all strings of a variable.
replace_number(): Replace numbers with a replacement.
replace_string(): Replace all strings with a replacement, ignoring case.
round_cols(): Round a selected column or a whole data frame to a given number of decimal places.
tidy_strings(): Tidy up character strings, non-numeric columns, or any selected columns in a data frame by putting all words in upper case, replacing any space, tabulation, or punctuation character with '_', and inserting '_' between lower- and upper-case characters. Suppose that str = c("Env1", "env 1", "env.1") (which by definition should represent a unique level in plant breeding trials, e.g., environment 1) is subjected to tidy_strings(str): the result will then be c("ENV_1", "ENV_1", "ENV_1"). See the Examples section for more examples.

Usage

all_upper_case(.data, ...)

all_lower_case(.data, ...)

all_title_case(.data, ...)

first_upper_case(.data, ...)

extract_number(.data, ..., pattern = NULL)

extract_string(.data, ..., pattern = NULL)

find_text_in_num(.data, ...)

has_text_in_num(.data)

remove_space(.data, ...)

remove_strings(.data, ...)

replace_number(
  .data,
  ...,
  pattern = NULL,
  replacement = "",
  ignore_case = FALSE
)

replace_string(
  .data,
  ...,
  pattern = NULL,
  replacement = "",
  ignore_case = FALSE
)

round_cols(.data, ..., digits = 2)

tidy_strings(.data, ..., sep = "_")

Arguments

.data

A data frame

...

The argument depends on the function used.

For round_cols() ... are the variables to round. If no variable is supplied, all the numeric variables in .data are used.
For all_lower_case(), all_upper_case(), all_title_case(), extract_number(), extract_string(), remove_strings(), and tidy_strings() ... are the variables to apply the function to. If no variable is supplied, the function will be applied to all non-numeric variables in .data.

pattern

A string to be matched. Regular Expression Syntax is also allowed.

replacement

A string for replacement.

ignore_case

If FALSE (default), the pattern matching is case sensitive and if TRUE, case is ignored during matching.

digits

The number of decimal places to round to.

sep

A character string to separate the terms. Defaults to "_".

Author

Tiago Olivoto tiagoolivoto@gmail.com

Examples

# \donttest{
library(metan)

################ Rounding numbers ###############
# All numeric columns
round_cols(data_ge2, digits = 1)
#> # A tibble: 156 × 18
#>    ENV   GEN   REP      PH    EH    EP    EL    ED    CL    CD    CW    KW    NR
#>    <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 A1    H1    1       2.6   1.7   0.7  16.1  52.2  28.1  16.3  25.1  216.  15.6
#>  2 A1    H1    2       2.9   1.8   0.6  14.2  50.3  27.6  14.5  21.4  184.  16  
#>  3 A1    H1    3       2.7   1.6   0.6  16    50.7  28.4  16.4  24    208.  17.2
#>  4 A1    H10   1       2.8   1.6   0.6  16.7  54.1  31.7  17.4  26.2  194.  15.6
#>  5 A1    H10   2       2.8   1.7   0.6  14.9  52.7  32    15.5  20.7  176.  17.6
#>  6 A1    H10   3       2.7   1.5   0.6  16.7  52.7  30.4  17.5  26.8  207.  16.8
#>  7 A1    H11   1       2.8   1.5   0.5  17.4  51.7  30.6  18    26.2  217.  16.8
#>  8 A1    H11   2       2.7   1.6   0.6  16.7  47.2  28.7  17.2  24.1  181.  13.6
#>  9 A1    H11   3       2.8   1.7   0.6  15.8  47.9  27.6  16.4  20.5  166.  15.2
#> 10 A1    H12   1       2.7   1.5   0.6  14.9  47.5  28.2  15.5  20.1  161   14.8
#> # … with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>,
#> #   PERK <dbl>, TKW <dbl>, NKE <dbl>

# Round specific columns
round_cols(data_ge2, EP, digits = 1)
#> # A tibble: 156 × 18
#>    ENV   GEN   REP      PH    EH    EP    EL    ED    CL    CD    CW    KW    NR
#>    <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 A1    H1    1      2.61  1.71   0.7  16.1  52.2  28.1  16.3  25.1  217.  15.6
#>  2 A1    H1    2      2.87  1.76   0.6  14.2  50.3  27.6  14.5  21.4  184.  16  
#>  3 A1    H1    3      2.68  1.58   0.6  16.0  50.7  28.4  16.4  24.0  208.  17.2
#>  4 A1    H10   1      2.83  1.64   0.6  16.7  54.1  31.7  17.4  26.2  194.  15.6
#>  5 A1    H10   2      2.79  1.71   0.6  14.9  52.7  32.0  15.5  20.7  176.  17.6
#>  6 A1    H10   3      2.72  1.51   0.6  16.7  52.7  30.4  17.5  26.8  207.  16.8
#>  7 A1    H11   1      2.75  1.51   0.5  17.4  51.7  30.6  18.0  26.2  217.  16.8
#>  8 A1    H11   2      2.72  1.56   0.6  16.7  47.2  28.7  17.2  24.1  181.  13.6
#>  9 A1    H11   3      2.77  1.67   0.6  15.8  47.9  27.6  16.4  20.5  166.  15.2
#> 10 A1    H12   1      2.73  1.54   0.6  14.9  47.5  28.2  15.5  20.1  161.  14.8
#> # … with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>,
#> #   PERK <dbl>, TKW <dbl>, NKE <dbl>

########### Extract or replace numbers ##########
# Extract numbers
extract_number(data_ge, GEN)
#> # A tibble: 420 × 5
#>    ENV     GEN REP      GY    HM
#>    <fct> <dbl> <fct> <dbl> <dbl>
#>  1 E1        1 1      2.17  44.9
#>  2 E1        1 2      2.50  46.9
#>  3 E1        1 3      2.43  47.8
#>  4 E1        2 1      3.21  45.2
#>  5 E1        2 2      2.93  45.3
#>  6 E1        2 3      2.56  45.5
#>  7 E1        3 1      2.77  46.7
#>  8 E1        3 2      3.62  43.2
#>  9 E1        3 3      2.28  47.8
#> 10 E1        4 1      2.36  47.9
#> # … with 410 more rows
# Replace numbers
replace_number(data_ge, GEN)
#> # A tibble: 420 × 5
#>    ENV   GEN   REP      GY    HM
#>    <fct> <chr> <fct> <dbl> <dbl>
#>  1 E1    G     1      2.17  44.9
#>  2 E1    G     2      2.50  46.9
#>  3 E1    G     3      2.43  47.8
#>  4 E1    G     1      3.21  45.2
#>  5 E1    G     2      2.93  45.3
#>  6 E1    G     3      2.56  45.5
#>  7 E1    G     1      2.77  46.7
#>  8 E1    G     2      3.62  43.2
#>  9 E1    G     3      2.28  47.8
#> 10 E1    G     1      2.36  47.9
#> # … with 410 more rows
replace_number(data_ge,
               GEN,
               pattern = 1,
               replacement = "_one")
#> # A tibble: 420 × 5
#>    ENV   GEN   REP      GY    HM
#>    <fct> <chr> <fct> <dbl> <dbl>
#>  1 E1    G_one 1      2.17  44.9
#>  2 E1    G_one 2      2.50  46.9
#>  3 E1    G_one 3      2.43  47.8
#>  4 E1    G2    1      3.21  45.2
#>  5 E1    G2    2      2.93  45.3
#>  6 E1    G2    3      2.56  45.5
#>  7 E1    G3    1      2.77  46.7
#>  8 E1    G3    2      3.62  43.2
#>  9 E1    G3    3      2.28  47.8
#> 10 E1    G4    1      2.36  47.9
#> # … with 410 more rows

########## Extract, replace or remove strings ##########
# Extract strings
extract_string(data_ge, GEN)
#> # A tibble: 420 × 5
#>    ENV   GEN   REP      GY    HM
#>    <fct> <chr> <fct> <dbl> <dbl>
#>  1 E1    G     1      2.17  44.9
#>  2 E1    G     2      2.50  46.9
#>  3 E1    G     3      2.43  47.8
#>  4 E1    G     1      3.21  45.2
#>  5 E1    G     2      2.93  45.3
#>  6 E1    G     3      2.56  45.5
#>  7 E1    G     1      2.77  46.7
#>  8 E1    G     2      3.62  43.2
#>  9 E1    G     3      2.28  47.8
#> 10 E1    G     1      2.36  47.9
#> # … with 410 more rows

# Replace strings
replace_string(data_ge, GEN)
#> # A tibble: 420 × 5
#>    ENV   GEN   REP      GY    HM
#>    <fct> <chr> <fct> <dbl> <dbl>
#>  1 E1    1     1      2.17  44.9
#>  2 E1    1     2      2.50  46.9
#>  3 E1    1     3      2.43  47.8
#>  4 E1    2     1      3.21  45.2
#>  5 E1    2     2      2.93  45.3
#>  6 E1    2     3      2.56  45.5
#>  7 E1    3     1      2.77  46.7
#>  8 E1    3     2      3.62  43.2
#>  9 E1    3     3      2.28  47.8
#> 10 E1    4     1      2.36  47.9
#> # … with 410 more rows
replace_string(data_ge,
               GEN,
               pattern = "G",
               replacement = "GENOTYPE_")
#> # A tibble: 420 × 5
#>    ENV   GEN        REP      GY    HM
#>    <fct> <chr>      <fct> <dbl> <dbl>
#>  1 E1    GENOTYPE_1 1      2.17  44.9
#>  2 E1    GENOTYPE_1 2      2.50  46.9
#>  3 E1    GENOTYPE_1 3      2.43  47.8
#>  4 E1    GENOTYPE_2 1      3.21  45.2
#>  5 E1    GENOTYPE_2 2      2.93  45.3
#>  6 E1    GENOTYPE_2 3      2.56  45.5
#>  7 E1    GENOTYPE_3 1      2.77  46.7
#>  8 E1    GENOTYPE_3 2      3.62  43.2
#>  9 E1    GENOTYPE_3 3      2.28  47.8
#> 10 E1    GENOTYPE_4 1      2.36  47.9
#> # … with 410 more rows

# Remove strings
remove_strings(data_ge)
#> # A tibble: 420 × 5
#>      ENV   GEN   REP    GY    HM
#>    <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1     1     1     1  2.17  44.9
#>  2     1     1     2  2.50  46.9
#>  3     1     1     3  2.43  47.8
#>  4     1     2     1  3.21  45.2
#>  5     1     2     2  2.93  45.3
#>  6     1     2     3  2.56  45.5
#>  7     1     3     1  2.77  46.7
#>  8     1     3     2  3.62  43.2
#>  9     1     3     3  2.28  47.8
#> 10     1     4     1  2.36  47.9
#> # … with 410 more rows
remove_strings(data_ge, ENV)
#> # A tibble: 420 × 5
#>      ENV GEN   REP      GY    HM
#>    <dbl> <fct> <fct> <dbl> <dbl>
#>  1     1 G1    1      2.17  44.9
#>  2     1 G1    2      2.50  46.9
#>  3     1 G1    3      2.43  47.8
#>  4     1 G2    1      3.21  45.2
#>  5     1 G2    2      2.93  45.3
#>  6     1 G2    3      2.56  45.5
#>  7     1 G3    1      2.77  46.7
#>  8     1 G3    2      3.62  43.2
#>  9     1 G3    3      2.28  47.8
#> 10     1 G4    1      2.36  47.9
#> # … with 410 more rows


############ Find text in numeric sequences ###########
mixed_text <- data.frame(data_ge)
mixed_text[2, 4] <- "2..503"
mixed_text[3, 4] <- "3.2o75"
find_text_in_num(mixed_text, GY)
#>      GY
#> [1,]  2
#> [2,]  3

############# upper, lower and title cases ############
gen_text <- c("This is the first string.", "this is the second one")
all_lower_case(gen_text)
#> [1] "this is the first string." "this is the second one"   
all_upper_case(gen_text)
#> [1] "THIS IS THE FIRST STRING." "THIS IS THE SECOND ONE"   
all_title_case(gen_text)
#> [1] "This Is The First String." "This Is The Second One"   
first_upper_case(gen_text)
#> [1] "This is the first string." "This is the second one"   

# A whole data frame
all_lower_case(data_ge)
#> # A tibble: 420 × 5
#>    ENV   GEN   REP      GY    HM
#>    <chr> <chr> <chr> <dbl> <dbl>
#>  1 e1    g1    1      2.17  44.9
#>  2 e1    g1    2      2.50  46.9
#>  3 e1    g1    3      2.43  47.8
#>  4 e1    g2    1      3.21  45.2
#>  5 e1    g2    2      2.93  45.3
#>  6 e1    g2    3      2.56  45.5
#>  7 e1    g3    1      2.77  46.7
#>  8 e1    g3    2      3.62  43.2
#>  9 e1    g3    3      2.28  47.8
#> 10 e1    g4    1      2.36  47.9
#> # … with 410 more rows


############### Tidy up messy text string ##############
messy_env <- c("ENV 1", "Env   1", "Env1", "env1", "Env.1", "Env_1")
tidy_strings(messy_env)
#> [1] "ENV_1" "ENV_1" "ENV_1" "ENV_1" "ENV_1" "ENV_1"

messy_gen <- c("GEN1", "gen 2", "Gen.3", "gen-4", "Gen_5", "GEN_6")
tidy_strings(messy_gen)
#> [1] "GEN_1" "GEN_2" "GEN_3" "GEN_4" "GEN_5" "GEN_6"

messy_int <- c("EnvGen", "Env_Gen", "env gen", "Env Gen", "ENV.GEN", "ENV_GEN")
tidy_strings(messy_int)
#> [1] "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN"

library(tibble)
# Or a whole data frame
df <- tibble(Env = messy_env,
             gen = messy_gen,
             Env_GEN = interaction(Env, gen),
             y = rnorm(6, 300, 10))
df
#> # A tibble: 6 × 4
#>   Env     gen   Env_GEN           y
#>   <chr>   <chr> <fct>         <dbl>
#> 1 ENV 1   GEN1  ENV 1.GEN1     305.
#> 2 Env   1 gen 2 Env   1.gen 2  302.
#> 3 Env1    Gen.3 Env1.Gen.3     292.
#> 4 env1    gen-4 env1.gen-4     318.
#> 5 Env.1   Gen_5 Env.1.Gen_5    301.
#> 6 Env_1   GEN_6 Env_1.GEN_6    309.
tidy_strings(df)
#> # A tibble: 6 × 4
#>   Env   gen   Env_GEN         y
#>   <chr> <chr> <chr>       <dbl>
#> 1 ENV_1 GEN_1 ENV_1_GEN_1  305.
#> 2 ENV_1 GEN_2 ENV_1_GEN_2  302.
#> 3 ENV_1 GEN_3 ENV_1_GEN_3  292.
#> 4 ENV_1 GEN_4 ENV_1_GEN_4  318.
#> 5 ENV_1 GEN_5 ENV_1_GEN_5  301.
#> 6 ENV_1 GEN_6 ENV_1_GEN_6  309.
# }