Solusi cepat dan ringkas tidyverse
: (lebih dari dua kali lebih cepat dari Base R read.csv
)
tbl <-
list.files(pattern = "*.csv") %>%
map_df(~read_csv(.))
dan data.table 's fread()
bahkan dapat memotong-saat beban setengah lagi. (untuk 1/4 kali Base R )
library(data.table)
tbl_fread <-
list.files(pattern = "*.csv") %>%
map_df(~fread(.))
The stringsAsFactors = FALSE
argumen menjaga bebas faktor dataframe, (dan sebagai marbel poin, adalah pengaturan default untuk fread
)
Jika typecasting sedang kurang ajar, Anda bisa memaksa semua kolom sebagai karakter dengan col_types
argumen.
tbl <-
list.files(pattern = "*.csv") %>%
map_df(~read_csv(., col_types = cols(.default = "c")))
Jika Anda ingin masuk ke dalam subdirektori untuk membuat daftar file yang pada akhirnya akan diikat, maka pastikan untuk memasukkan nama path, serta daftarkan file dengan nama lengkapnya di daftar Anda. Ini akan memungkinkan pekerjaan mengikat untuk pergi di luar direktori saat ini. (Memikirkan nama path lengkap sebagai operasi seperti paspor untuk memungkinkan pergerakan kembali melintasi 'batas' direktori.)
tbl <-
list.files(path = "./subdirectory/",
pattern = "*.csv",
full.names = T) %>%
map_df(~read_csv(., col_types = cols(.default = "c")))
Seperti yang dijelaskan Hadley di sini (sekitar setengah jalan):
map_df(x, f)
secara efektif sama dengan do.call("rbind", lapply(x, f))
....
Fitur Bonus - menambahkan nama file ke catatan per permintaan fitur Niks dalam komentar di bawah ini:
* Tambahkan asli filename
ke setiap catatan.
Kode menjelaskan: membuat fungsi untuk menambahkan nama file ke setiap catatan selama pembacaan awal tabel. Kemudian gunakan fungsi itu alih-alih read_csv()
fungsi sederhana .
read_plus <- function(flnm) {
read_csv(flnm) %>%
mutate(filename = flnm)
}
tbl_with_sources <-
list.files(pattern = "*.csv",
full.names = T) %>%
map_df(~read_plus(.))
(Pendekatan typecasting dan penanganan subdirektori juga dapat ditangani di dalam read_plus()
fungsi dengan cara yang sama seperti yang diilustrasikan dalam varian kedua dan ketiga yang disarankan di atas.)
### Benchmark Code & Results
library(tidyverse)
library(data.table)
library(microbenchmark)
### Base R Approaches
#### Instead of a dataframe, this approach creates a list of lists
#### removed from analysis as this alone doubled analysis time reqd
# lapply_read.delim <- function(path, pattern = "*.csv") {
# temp = list.files(path, pattern, full.names = TRUE)
# myfiles = lapply(temp, read.delim)
# }
#### `read.csv()`
do.call_rbind_read.csv <- function(path, pattern = "*.csv") {
files = list.files(path, pattern, full.names = TRUE)
do.call(rbind, lapply(files, function(x) read.csv(x, stringsAsFactors = FALSE)))
}
map_df_read.csv <- function(path, pattern = "*.csv") {
list.files(path, pattern, full.names = TRUE) %>%
map_df(~read.csv(., stringsAsFactors = FALSE))
}
### *dplyr()*
#### `read_csv()`
lapply_read_csv_bind_rows <- function(path, pattern = "*.csv") {
files = list.files(path, pattern, full.names = TRUE)
lapply(files, read_csv) %>% bind_rows()
}
map_df_read_csv <- function(path, pattern = "*.csv") {
list.files(path, pattern, full.names = TRUE) %>%
map_df(~read_csv(., col_types = cols(.default = "c")))
}
### *data.table* / *purrr* hybrid
map_df_fread <- function(path, pattern = "*.csv") {
list.files(path, pattern, full.names = TRUE) %>%
map_df(~fread(.))
}
### *data.table*
rbindlist_fread <- function(path, pattern = "*.csv") {
files = list.files(path, pattern, full.names = TRUE)
rbindlist(lapply(files, function(x) fread(x)))
}
do.call_rbind_fread <- function(path, pattern = "*.csv") {
files = list.files(path, pattern, full.names = TRUE)
do.call(rbind, lapply(files, function(x) fread(x, stringsAsFactors = FALSE)))
}
read_results <- function(dir_size){
microbenchmark(
# lapply_read.delim = lapply_read.delim(dir_size), # too slow to include in benchmarks
do.call_rbind_read.csv = do.call_rbind_read.csv(dir_size),
map_df_read.csv = map_df_read.csv(dir_size),
lapply_read_csv_bind_rows = lapply_read_csv_bind_rows(dir_size),
map_df_read_csv = map_df_read_csv(dir_size),
rbindlist_fread = rbindlist_fread(dir_size),
do.call_rbind_fread = do.call_rbind_fread(dir_size),
map_df_fread = map_df_fread(dir_size),
times = 10L)
}
read_results_lrg_mid_mid <- read_results('./testFolder/500MB_12.5MB_40files')
print(read_results_lrg_mid_mid, digits = 3)
read_results_sml_mic_mny <- read_results('./testFolder/5MB_5KB_1000files/')
read_results_sml_tny_mod <- read_results('./testFolder/5MB_50KB_100files/')
read_results_sml_sml_few <- read_results('./testFolder/5MB_500KB_10files/')
read_results_med_sml_mny <- read_results('./testFolder/50MB_5OKB_1000files')
read_results_med_sml_mod <- read_results('./testFolder/50MB_5OOKB_100files')
read_results_med_med_few <- read_results('./testFolder/50MB_5MB_10files')
read_results_lrg_sml_mny <- read_results('./testFolder/500MB_500KB_1000files')
read_results_lrg_med_mod <- read_results('./testFolder/500MB_5MB_100files')
read_results_lrg_lrg_few <- read_results('./testFolder/500MB_50MB_10files')
read_results_xlg_lrg_mod <- read_results('./testFolder/5000MB_50MB_100files')
print(read_results_sml_mic_mny, digits = 3)
print(read_results_sml_tny_mod, digits = 3)
print(read_results_sml_sml_few, digits = 3)
print(read_results_med_sml_mny, digits = 3)
print(read_results_med_sml_mod, digits = 3)
print(read_results_med_med_few, digits = 3)
print(read_results_lrg_sml_mny, digits = 3)
print(read_results_lrg_med_mod, digits = 3)
print(read_results_lrg_lrg_few, digits = 3)
print(read_results_xlg_lrg_mod, digits = 3)
# display boxplot of my typical use case results & basic machine max load
par(oma = c(0,0,0,0)) # remove overall margins if present
par(mfcol = c(1,1)) # remove grid if present
par(mar = c(12,5,1,1) + 0.1) # to display just a single boxplot with its complete labels
boxplot(read_results_lrg_mid_mid, las = 2, xlab = "", ylab = "Duration (seconds)", main = "40 files @ 12.5MB (500MB)")
boxplot(read_results_xlg_lrg_mod, las = 2, xlab = "", ylab = "Duration (seconds)", main = "100 files @ 50MB (5GB)")
# generate 3x3 grid boxplots
par(oma = c(12,1,1,1)) # margins for the whole 3 x 3 grid plot
par(mfcol = c(3,3)) # create grid (filling down each column)
par(mar = c(1,4,2,1)) # margins for the individual plots in 3 x 3 grid
boxplot(read_results_sml_mic_mny, las = 2, xlab = "", ylab = "Duration (seconds)", main = "1000 files @ 5KB (5MB)", xaxt = 'n')
boxplot(read_results_sml_tny_mod, las = 2, xlab = "", ylab = "Duration (milliseconds)", main = "100 files @ 50KB (5MB)", xaxt = 'n')
boxplot(read_results_sml_sml_few, las = 2, xlab = "", ylab = "Duration (milliseconds)", main = "10 files @ 500KB (5MB)",)
boxplot(read_results_med_sml_mny, las = 2, xlab = "", ylab = "Duration (microseconds) ", main = "1000 files @ 50KB (50MB)", xaxt = 'n')
boxplot(read_results_med_sml_mod, las = 2, xlab = "", ylab = "Duration (microseconds)", main = "100 files @ 500KB (50MB)", xaxt = 'n')
boxplot(read_results_med_med_few, las = 2, xlab = "", ylab = "Duration (seconds)", main = "10 files @ 5MB (50MB)")
boxplot(read_results_lrg_sml_mny, las = 2, xlab = "", ylab = "Duration (seconds)", main = "1000 files @ 500KB (500MB)", xaxt = 'n')
boxplot(read_results_lrg_med_mod, las = 2, xlab = "", ylab = "Duration (seconds)", main = "100 files @ 5MB (500MB)", xaxt = 'n')
boxplot(read_results_lrg_lrg_few, las = 2, xlab = "", ylab = "Duration (seconds)", main = "10 files @ 50MB (500MB)")
Kasus Penggunaan Middling
Kasing Penggunaan Lebih Besar
Berbagai Kasus Penggunaan
Baris: jumlah file (1000, 100, 10)
Kolom: ukuran kerangka data akhir (5MB, 50MB, 500MB)
(klik pada gambar untuk melihat ukuran asli)
Hasil R dasar lebih baik untuk kasus penggunaan terkecil di mana overhead membawa perpustakaan C purrr dan dplyr untuk menanggung lebih besar daripada keuntungan kinerja yang diamati ketika melakukan tugas pemrosesan skala yang lebih besar.
jika Anda ingin menjalankan tes Anda sendiri, Anda mungkin menemukan skrip bash ini bermanfaat.
for ((i=1; i<=$2; i++)); do
cp "$1" "${1:0:8}_${i}.csv";
done
bash what_you_name_this_script.sh "fileName_you_want_copied" 100
akan membuat 100 salinan file Anda diberi nomor urut (setelah 8 karakter awal nama file dan garis bawah).
Atribusi dan Penghargaan
Dengan terima kasih khusus kepada:
- Tyler Rinker dan Akrun karena menunjukkan microbenchmark.
- Jake Kaupp untuk memperkenalkan saya ke
map_df()
sini .
- David McLaughlin untuk umpan balik yang bermanfaat dalam meningkatkan visualisasi dan mendiskusikan / mengkonfirmasi inversi kinerja yang diamati dalam file kecil, hasil analisis dataframe kecil.
- marbel untuk menunjukkan perilaku default untuk
fread()
. (Saya perlu belajar data.table
.)