BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

tabulate.jl (6541B)


      1 @testset "Tabulate" begin
      2 
      3     # on existing dataset
      4     df = dropmissing(DataFrame(PalmerPenguins.load()))
      5     cols = :island
      6 
      7     # Test that function do not error on empty
      8     @test isnothing(tabulate(df[ df.island .== "Brehat", :], :sex))
      9 
     10     col_length = combine(groupby(df, cols), cols .=> length => :_N)
     11     sort!(col_length, cols)
     12     col_tab = tabulate(df, :island; out=:df);
     13     sort!(col_tab, cols)
     14     @test col_length._N == col_tab.freq
     15 
     16     # test the string output
     17     tab_buf = IOBuffer(tabulate(df, :island; out=:string))
     18     tab_string = String(take!(tab_buf))
     19     @test count(==('\n'), tab_string) == 5 # test number of lines expected
     20     first_line = split(tab_string, '\n', limit=2)[1]
     21     @test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum", "Hist."])
     22 
     23     tab_buf = IOBuffer(tabulate(df, :island; out=:string, skip_stat=:freq_hist))
     24     tab_string = String(take!(tab_buf))
     25     @test count(==('\n'), tab_string) == 5 # test number of lines expected
     26     first_line = split(tab_string, '\n', limit=2)[1]
     27     @test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum"])
     28 
     29     # test the nothing output
     30     tab_stdout = tabulate(df, :island, out=:stdout)
     31     @test typeof(tab_stdout) == Nothing
     32     tab_stdout = stdout_string() do # had to request a convenient package for this one...
     33         tabulate(df, :island, out=:stdout)
     34     end
     35     @test count(==('\n'), tab_stdout) == 5 # test number of lines expected
     36     first_line = split(tab_stdout, '\n', limit=2)[1]
     37     @test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum", "Hist."])
     38 
     39     # test the type columns get properly passed
     40     @test contains(tabulate(df, [:island, :species], group_type = [:type, :value], out=:string),
     41                    "island_typeof")
     42     @test contains(tabulate(df, [:island, :species], group_type = [:value, :type], out=:string),
     43                    "species_typeof")
     44 
     45     # test the twoway ad wide tabulate
     46     df_twoway = tabulate(df, [:island, :species], format_tbl=:wide, out=:df);
     47     @test names(df_twoway) == ["-", "Adelie", "Gentoo", "Chinstrap", "Total by island"]
     48     @test nrow(df_twoway) == 4
     49     df_twoway = tabulate(df, [:sex, :island, :species], format_tbl=:wide, out=:df);
     50     @test names(df_twoway) == ["-", "--", "Adelie", "Gentoo", "Chinstrap", "Total by sex, island"]
     51     @test nrow(df_twoway) == 7
     52 
     53     # on a specific dataset (see issue #1)
     54     df = DataFrame(x = [1, 2, 5, "NA", missing], y = ["a", "c", "b", "e", "d"])
     55     df_tab = tabulate(df, :x, reorder_cols=true, out=:df)
     56     @test isequal(df_tab.x, df.x)
     57 
     58     # test the group type options
     59     df = DataFrame(x = [1, 2, 2, "NA", missing], y = ["c", "c", "b", "z", "d"])
     60     @test isequal(
     61         tabulate(df, [:x, :y], out=:df).y,
     62         sort(df.y))
     63     @test nrow(tabulate(df, [:x, :y], group_type = :value, out=:df)) == 5
     64     @test nrow(tabulate(df, [:x, :y], group_type = :type, out=:df)) == 3
     65     @test nrow(tabulate(df, [:x, :y], group_type = [:type, :value], out=:df)) == 4
     66     @test nrow(tabulate(df, [:x, :y], group_type = [:value, :type], out=:df)) == 4
     67 
     68 end
     69 
     70 
     71 @testset "Tabulate - wide format pct" begin
     72     df = dropmissing(DataFrame(PalmerPenguins.load()))
     73 
     74     # wide format with format_stat=:pct returns a DataFrame
     75     df_pct = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:df)
     76     @test df_pct isa DataFrame
     77     @test nrow(df_pct) == 3
     78     # pct columns should not have a totals column (unlike freq)
     79     @test !any(contains.(names(df_pct), "Total"))
     80 
     81     # wide format pct as string output
     82     pt = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:string)
     83     @test pt isa String
     84     @test length(pt) > 0
     85 
     86     # wide format pct stdout returns nothing
     87     result = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:stdout)
     88     @test isnothing(result)
     89 end
     90 
     91 
     92 @testset "Tabulate - wide format string output" begin
     93     df = dropmissing(DataFrame(PalmerPenguins.load()))
     94 
     95     # wide freq as string
     96     pt = tabulate(df, [:island, :species], format_tbl=:wide, out=:string)
     97     @test pt isa String
     98     @test contains(pt, "Adelie")
     99     @test contains(pt, "Gentoo")
    100     @test contains(pt, "Chinstrap")
    101 
    102     # 3-column wide as string
    103     pt = tabulate(df, [:sex, :island, :species], format_tbl=:wide, out=:string)
    104     @test pt isa String
    105     @test contains(pt, "Adelie")
    106 end
    107 
    108 
    109 @testset "Tabulate - missing values" begin
    110     # DataFrame with missing values in the tabulated column
    111     df = DataFrame(x = [1, 2, missing, 1, missing, 3])
    112     df_tab = tabulate(df, :x, out=:df)
    113     @test nrow(df_tab) == 4  # 1, 2, 3, missing
    114     @test sum(df_tab.freq) == 6
    115     @test :freq in propertynames(df_tab)
    116     @test :pct in propertynames(df_tab)
    117     @test :cum in propertynames(df_tab)
    118 
    119     # string output with missing values should not error
    120     pt = tabulate(df, :x, out=:string)
    121     @test pt isa String
    122     @test contains(pt, "missing")
    123 
    124     # two-column with missing
    125     df = DataFrame(x = ["a", "b", missing, "a"], y = [1, 2, 3, missing])
    126     df_tab = tabulate(df, [:x, :y], out=:df)
    127     @test nrow(df_tab) == 4
    128     @test sum(df_tab.freq) == 4
    129 end
    130 
    131 
    132 @testset "Tabulate - skip_stat vector" begin
    133     df = dropmissing(DataFrame(PalmerPenguins.load()))
    134 
    135     # skip multiple stats
    136     pt = tabulate(df, :island, out=:string, skip_stat=[:freq_hist, :cum])
    137     first_line = split(pt, '\n', limit=2)[1]
    138     @test contains(first_line, "Freq")
    139     @test contains(first_line, "Percent")
    140     @test !contains(first_line, "Cum")
    141     @test !contains(first_line, "Hist")
    142 
    143     # skip just freq
    144     pt = tabulate(df, :island, out=:string, skip_stat=:freq)
    145     first_line = split(pt, '\n', limit=2)[1]
    146     @test !contains(first_line, "Freq.")
    147     @test contains(first_line, "Percent")
    148 end
    149 
    150 
    151 @testset "Tabulate - single row DataFrame" begin
    152     df = DataFrame(x = ["only_value"])
    153     df_tab = tabulate(df, :x, out=:df)
    154     @test nrow(df_tab) == 1
    155     @test df_tab.freq[1] == 1
    156     @test df_tab.cum[1] == 100
    157 end
    158 
    159 
    160 @testset "Tabulate - reorder_cols=false" begin
    161     df = DataFrame(x = ["c", "a", "b", "a", "c", "c"])
    162     df_tab = tabulate(df, :x, reorder_cols=false, out=:df)
    163     # without reordering, original groupby order is preserved
    164     @test nrow(df_tab) == 3
    165     @test sum(df_tab.freq) == 6
    166 end
    167 
    168 
    169 @testset "Tabulate - invalid format_stat in wide" begin
    170     df = dropmissing(DataFrame(PalmerPenguins.load()))
    171     @test_throws Exception tabulate(df, [:island, :species],
    172         format_tbl=:wide, format_stat=:invalid, out=:df)
    173 end