Мне нужно заменить имена студентов в более длинном файле, используя более короткий файл сопоставления. Более длинный файл содержит только имена (без фамилий) и имена файлов экзаменов. Файл сопоставления содержит полные имена и имя файла экзамена. Мне нужно взять полные имена из файла сопоставления. Должны выполняться два условия: имя учащегося должно совпадать в обоих файлах, а поскольку у нескольких учащихся одинаковое имя, имя файла экзамена тоже должно совпадать.
# Short lookup table: one row per student, holding the full student
# identifier and the per-student exam file it belongs to.
student_file_mapping <- data.frame(
  student = c(
    "paul_johns",
    "mary_bash",
    "paul_simons"
  ),
  file = c(
    "johns bash hunter 2022_exam1.paul_johns.csv",
    "johns bash hunter 2022_exam1.mary_bash.csv",
    "nichols simons smith 2022_exam1.paul_simonsh.csv"
  )
)
> student_file_mapping
student file
1 paul_johns johns bash hunter 2022_exam1.paul_johns.csv
2 mary_bash johns bash hunter 2022_exam1.mary_bash.csv
3 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv
# Long table: first name glued to the exam name, plus the exam file.
# Rows 1-9 are three students repeated three times each; rows 10-12 add one
# more row per student. Row 10 carries a misspelled file name ("jons").
data_full <- data.frame(
  student_file = c(
    rep(
      c(
        "paul_johns bash hunter 2022_exam1",
        "mary_johns bash hunter 2022_exam1",
        "paul_nichols simons smith 2022_exam1"
      ),
      each = 3
    ),
    "paul_johns bash hunter 2022_exam1",
    "mary_johns bash hunter 2022_exam1",
    "paul_nichols simons smith 2022_exam1"
  ),
  File = c(
    rep("johns bash hunter 2022_exam1.csv", 6),
    rep("nichols simons smith 2022_exam1.csv", 3),
    "jons bash hunter 2022_exam1.csv",
    "johns bash hunter 2022_exam1.csv",
    "nichols simons smith 2022_exam1.csv"
  )
)
> data_full
student_file File
1 paul_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv
2 paul_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv
3 paul_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv
4 mary_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv
5 mary_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv
6 mary_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv
7 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv
8 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv
9 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv
10 paul_johns bash hunter 2022_exam1 jons bash hunter 2022_exam1.csv
11 mary_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv
12 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv
Я могу получить имена учеников из обоих файлов:
# First name = the piece before the first "_" in each table.
# Expected output is shown as `#>` comments; a trailing `-> paul` would be
# parsed by R as a right-assignment into a variable named `paul`.
str_split_i(student_file_mapping$student, "_", 1)[1]
#> [1] "paul"
str_split_i(data_full$student_file, "_", 1)[1]
#> [1] "paul"
И имена файлов экзаменов:
# Exam-file stem: the part before the first literal "." in the mapping file,
# and the part before the literal ".csv" extension in the long table.
# `fixed()` is used because a bare ".csv" pattern is a regex in which "."
# matches any character. Expected output is shown as `#>` comments.
str_split_i(student_file_mapping$file, "\\.", 1)[1]
#> [1] "johns bash hunter 2022_exam1"
str_split_i(data_full$File, fixed(".csv"), 1)[1]
#> [1] "johns bash hunter 2022_exam1"
Если я вставлю их в ifelse, я получу правильный вывод только для одной строки на каждого учащегося в длинном файле.
# Attempted lookup: label a row with the mapping's student when both the
# exam-file stem and the first name agree, otherwise "NOT FOUND".
# NOTE: `==` compares position by position, so the 3-row mapping of the
# example is recycled along the 12 rows of `data_full`; row i is only ever
# checked against mapping row ((i - 1) %% 3) + 1, which is why just some of
# the rows get a name.
data_full$student <- ifelse(
  str_split_i(data_full$File, ".csv", 1) ==
    str_split_i(student_file_mapping$file, "\\.", 1) &
    str_split_i(data_full$student_file, "_", 1) ==
      str_split_i(student_file_mapping$student, "_", 1),
  paste0(student_file_mapping$student),
  "NOT FOUND"
)
> data_full
student_file File student
1 paul_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv paul_johns
2 paul_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv NOT FOUND
3 paul_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv NOT FOUND
4 mary_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv NOT FOUND
5 mary_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv mary_bash
6 mary_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv NOT FOUND
7 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv NOT FOUND
8 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv NOT FOUND
9 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv paul_simons
10 paul_johns bash hunter 2022_exam1 jons bash hunter 2022_exam1.csv NOT FOUND
11 mary_johns bash hunter 2022_exam1 johns bash hunter 2022_exam1.csv mary_bash
12 paul_nichols simons smith 2022_exam1 nichols simons smith 2022_exam1.csv paul_simons
На моём реальном наборе данных этот код выполняет лишь частичную замену и, кроме того, выдаёт предупреждение `longer object length is not a multiple of shorter object length` — на данных примера оно не появляется.





Мне пришлось создать отдельные столбцы для имён и имён файлов, а затем использовать объединение (join) с нечётким сопоставлением, которое допускает орфографические ошибки!
library(stringr)
library(fuzzyjoin)
library(dplyr)
# Build the same two join keys in both tables: the student's first name and
# the exam-file stem.
student_file_mapping$first_name <-
  str_split_i(student_file_mapping$student, "_", 1)
# Mapping files look like "<exam stem>.<student>.csv": keep the part before
# the first literal dot.
student_file_mapping$file_name <-
  str_split_i(student_file_mapping$file, "\\.", 1)
data_full$first_name <- str_split_i(data_full$student_file, "_", 1)
# Cut at the literal ".csv"; `fixed()` prevents "." from being treated as the
# regex wildcard (a bare ".csv" pattern would also split at e.g. "xcsv").
data_full$file_name <- str_split_i(data_full$File, fixed(".csv"), 1)

# Fuzzy left join on both keys using Levenshtein distance ("lv"), so small
# typos such as "jons" vs "johns" still match. `max_dist = 2` is the
# function's default, spelled out here so the tolerance is visible.
data_merged <- stringdist_join(
  student_file_mapping, data_full,
  by = c("first_name", "file_name"),
  mode = "left",
  method = "lv",
  max_dist = 2
)
> data_merged
student file first_name.x
1 paul_johns johns bash hunter 2022_exam1.paul_johns.csv paul
2 paul_johns johns bash hunter 2022_exam1.paul_johns.csv paul
3 paul_johns johns bash hunter 2022_exam1.paul_johns.csv paul
4 paul_johns johns bash hunter 2022_exam1.paul_johns.csv paul
5 mary_bash johns bash hunter 2022_exam1.mary_bash.csv mary
6 mary_bash johns bash hunter 2022_exam1.mary_bash.csv mary
7 mary_bash johns bash hunter 2022_exam1.mary_bash.csv mary
8 mary_bash johns bash hunter 2022_exam1.mary_bash.csv mary
9 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv paul
10 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv paul
11 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv paul
12 paul_simons nichols simons smith 2022_exam1.paul_simonsh.csv paul
file_name.x student_file
1 johns bash hunter 2022_exam1 paul_johns bash hunter 2022_exam1
2 johns bash hunter 2022_exam1 paul_johns bash hunter 2022_exam1
3 johns bash hunter 2022_exam1 paul_johns bash hunter 2022_exam1
4 johns bash hunter 2022_exam1 paul_johns bash hunter 2022_exam1
5 johns bash hunter 2022_exam1 mary_johns bash hunter 2022_exam1
6 johns bash hunter 2022_exam1 mary_johns bash hunter 2022_exam1
7 johns bash hunter 2022_exam1 mary_johns bash hunter 2022_exam1
8 johns bash hunter 2022_exam1 mary_johns bash hunter 2022_exam1
9 nichols simons smith 2022_exam1 paul_nichols simons smith 2022_exam1
10 nichols simons smith 2022_exam1 paul_nichols simons smith 2022_exam1
11 nichols simons smith 2022_exam1 paul_nichols simons smith 2022_exam1
12 nichols simons smith 2022_exam1 paul_nichols simons smith 2022_exam1
File first_name.y file_name.y
1 johns bash hunter 2022_exam1.csv paul johns bash hunter 2022_exam1
2 johns bash hunter 2022_exam1.csv paul johns bash hunter 2022_exam1
3 johns bash hunter 2022_exam1.csv paul johns bash hunter 2022_exam1
4 jons bash hunter 2022_exam1.csv paul jons bash hunter 2022_exam1
5 johns bash hunter 2022_exam1.csv mary johns bash hunter 2022_exam1
6 johns bash hunter 2022_exam1.csv mary johns bash hunter 2022_exam1
7 johns bash hunter 2022_exam1.csv mary johns bash hunter 2022_exam1
8 johns bash hunter 2022_exam1.csv mary johns bash hunter 2022_exam1
9 nichols simons smith 2022_exam1.csv paul nichols simons smith 2022_exam1
10 nichols simons smith 2022_exam1.csv paul nichols simons smith 2022_exam1
11 nichols simons smith 2022_exam1.csv paul nichols simons smith 2022_exam1
12 nichols simons smith 2022_exam1.csv paul nichols simons smith 2022_exam1