แทนที่รายการอักขระด้วยลำดับของค่าทศนิยม (float) ตาม data-frame ที่ใช้แมป
ฉันมี data-frame สำหรับการแมป (mapping) และ data-frame ขนาดใหญ่ที่แต่ละแถวแทนลำดับ (sequence) ของโปรตีนหนึ่งตัว
ฉันต้องการวิธีที่มีประสิทธิภาพในการแมปลำดับกับค่าที่สอดคล้องกับกรดอะมิโนตามกรอบข้อมูลการแมป
ฉันสามารถทำซ้ำตามลำดับและแทนที่ด้วยรหัสต่อไปนี้:
#' Map a protein sequence to its per-residue stickiness values.
#'
#' Each residue letter is looked up in the global `stickiness_tabel` data
#' frame (column `X` = one-letter residue code, column `x` = stickiness).
#'
#' @param seq A single sequence string, or a one-element container holding
#'   it (e.g. a one-cell data frame); only the first string is used.
#' @return Invisibly, a numeric vector with one stickiness value per
#'   character of the sequence (`NA` for letters missing from the table).
#'   The vector is also printed.
calcStickiness <- function(seq) {
  # Flatten the input and coerce to character so factor columns work too.
  seq_string <- as.character(unlist(seq, use.names = FALSE))
  residues <- strsplit(seq_string, "")[[1]]
  # One vectorised lookup instead of growing a vector inside a loop; this
  # also keeps the values numeric instead of formatted strings.
  positions <- match(residues, stickiness_tabel[["X"]])
  transformed_seq <- stickiness_tabel[["x"]][positions]
  print(transformed_seq)
}
# Call the function on the `sequence_full` column of `row`. Single-bracket
# indexing keeps the argument a one-column data frame, not a bare string
# (assumes `row` is a data frame, as in the example row shown in this post).
calcStickiness(row["sequence_full"][1])
โดยที่ stickiness_tabel คือ:
structure(list(X = c("K", "E", "D", "N", "Q", "S", "P", "R",
"T", "H", "A", "G", "M", "V", "L", "I", "F", "C", "Y", "W"),
x = c(-1.25639466063649, -0.928687786101206, -0.700106643211895,
-0.356971499674196, -0.295054350932285, -0.209468209138379,
-0.177787659972006, -0.0892949396458573, 0.0576667944592403,
0.215277407729333, 0.263739398989502, 0.556792734365241,
0.7448899445842, 0.900506232741908, 1.06680680601946, 1.18416532767113,
1.68723510186035, 1.70109173545121, 1.70150269278206, 2.01452547017961
)), class = "data.frame", row.names = c(NA, -20L))
ฉันต้องการทราบว่ามีวิธีที่เร็วกว่านี้หรือไม่เพราะ data-frame ของลำดับของฉันมีรายการจำนวนมากอยู่ในนั้น
แถวตัวอย่างหนึ่งแถวของ data-frame คือ:
structure(list(X = 1L, code = "12as_1", nsub2 = 2L, pdb_error2 = "NO",
QSBIO_err_prob = 3.5, chain_name = "B", sequence_full = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL"), row.names = 1L, class = "data.frame")
โดยคอลัมน์ที่ฉันสนใจคือ sequence_full
แก้ไข
สำหรับแถวต่อไปนี้:
MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL
ฉันต้องการได้ผลลัพธ์ประมาณนี้:
[1] " 0.74488994" "-1.25639466" " 0.05766679" " 0.26373940" " 1.70150269" " 1.18416533" " 0.26373940" "-1.25639466" "-0.29505435"
[10] "-0.08929494" "-0.29505435" " 1.18416533" "-0.20946821" " 1.68723510" " 0.90050623" "-1.25639466" "-0.20946821" " 0.21527741"
[19] " 1.68723510" "-0.20946821" "-0.08929494" "-0.29505435" " 1.06680681" "-0.92868779" "-0.92868779" "-0.08929494" " 1.06680681"
[28] " 0.55679273" " 1.06680681" " 1.18416533" "-0.92868779" " 0.90050623" "-0.29505435" " 0.26373940" "-0.17778766" " 1.18416533"
[37] " 1.06680681" "-0.20946821" "-0.08929494" " 0.90050623" " 0.55679273" "-0.70010664" " 0.55679273" " 0.05766679" "-0.29505435"
[46] "-0.70010664" "-0.35697150" " 1.06680681" "-0.20946821" " 0.55679273" " 0.26373940" "-0.92868779" "-1.25639466" " 0.26373940"
[55] " 0.90050623" "-0.29505435" " 0.90050623" "-1.25639466" " 0.90050623" "-1.25639466" " 0.26373940" " 1.06680681" "-0.17778766"
[64] "-0.70010664" " 0.26373940" "-0.29505435" " 1.68723510" "-0.92868779" " 0.90050623" " 0.90050623" " 0.21527741" "-0.20946821"
[73] " 1.06680681" " 0.26373940" "-1.25639466" " 2.01452547" "-1.25639466" "-0.08929494" "-0.29505435" " 0.05766679" " 1.06680681"
[82] " 0.55679273" "-0.29505435" " 0.21527741" "-0.70010664" " 1.68723510" "-0.20946821" " 0.26373940" " 0.55679273" "-0.92868779"
[91] " 0.55679273" " 1.06680681" " 1.70150269" " 0.05766679" " 0.21527741" " 0.74488994" "-1.25639466" " 0.26373940" " 1.06680681"
[100] "-0.08929494" "-0.17778766" "-0.70010664" "-0.92868779" "-0.70010664" "-0.08929494" " 1.06680681" "-0.20946821" "-0.17778766"
[109] " 1.06680681" " 0.21527741" "-0.20946821" " 0.90050623" " 1.70150269" " 0.90050623" "-0.70010664" "-0.29505435" " 2.01452547"
[118] "-0.70010664" " 2.01452547" "-0.92868779" "-0.08929494" " 0.90050623" " 0.74488994" " 0.55679273" "-0.70010664" " 0.55679273"
[127] "-0.92868779" "-0.08929494" "-0.29505435" " 1.68723510" "-0.20946821" " 0.05766679" " 1.06680681" "-1.25639466" "-0.20946821"
[136] " 0.05766679" " 0.90050623" "-0.92868779" " 0.26373940" " 1.18416533" " 2.01452547" " 0.26373940" " 0.55679273" " 1.18416533"
[145] "-1.25639466" " 0.26373940" " 0.05766679" "-0.92868779" " 0.26373940" " 0.26373940" " 0.90050623" "-0.20946821" "-0.92868779"
[154] "-0.92868779" " 1.68723510" " 0.55679273" " 1.06680681" " 0.26373940" "-0.17778766" " 1.68723510" " 1.06680681" "-0.17778766"
[163] "-0.70010664" "-0.29505435" " 1.18416533" " 0.21527741" " 1.68723510" " 0.90050623" " 0.21527741" "-0.20946821" "-0.29505435"
[172] "-0.92868779" " 1.06680681" " 1.06680681" "-0.20946821" "-0.08929494" " 1.70150269" "-0.17778766" "-0.70010664" " 1.06680681"
[181] "-0.70010664" " 0.26373940" "-1.25639466" " 0.55679273" "-0.08929494" "-0.92868779" "-0.08929494" " 0.26373940" " 1.18416533"
[190] " 0.26373940" "-1.25639466" "-0.70010664" " 1.06680681" " 0.55679273" " 0.26373940" " 0.90050623" " 1.68723510" " 1.06680681"
[199] " 0.90050623" " 0.55679273" " 1.18416533" " 0.55679273" " 0.55679273" "-1.25639466" " 1.06680681" "-0.20946821" "-0.70010664"
[208] " 0.55679273" " 0.21527741" "-0.08929494" " 0.21527741" "-0.70010664" " 0.90050623" "-0.08929494" " 0.26373940" "-0.17778766"
[217] "-0.70010664" " 1.70150269" "-0.70010664" "-0.70010664" " 2.01452547" "-0.20946821" " 0.05766679" "-0.17778766" "-0.20946821"
[226] "-0.92868779" " 1.06680681" " 0.55679273" " 0.21527741" " 0.26373940" " 0.55679273" " 1.06680681" "-0.35697150" " 0.55679273"
[235] "-0.70010664" " 1.18416533" " 1.06680681" " 0.90050623" " 2.01452547" "-0.35697150" "-0.17778766" " 0.90050623" " 1.06680681"
[244] "-0.92868779" "-0.70010664" " 0.26373940" " 1.68723510" "-0.92868779" " 1.06680681" "-0.20946821" "-0.20946821" " 0.74488994"
[253] " 0.55679273" " 1.18416533" "-0.08929494" " 0.90050623" "-0.70010664" " 0.26373940" "-0.70010664" " 0.05766679" " 1.06680681"
[262] "-1.25639466" " 0.21527741" "-0.29505435" " 1.06680681" " 0.26373940" " 1.06680681" " 0.05766679" " 0.55679273" "-0.70010664"
[271] "-0.92868779" "-0.70010664" "-0.08929494" " 1.06680681" "-0.92868779" " 1.06680681" "-0.92868779" " 2.01452547" " 0.21527741"
[280] "-0.29505435" " 0.26373940" " 1.06680681" " 1.06680681" "-0.08929494" " 0.55679273" "-0.92868779" " 0.74488994" "-0.17778766"
[289] "-0.29505435" " 0.05766679" " 1.18416533" " 0.55679273" " 0.55679273" " 0.55679273" " 1.18416533" " 0.55679273" "-0.29505435"
[298] "-0.20946821" "-0.08929494" " 1.06680681" " 0.05766679" " 0.74488994" " 1.06680681" " 1.06680681" " 1.06680681" "-0.29505435"
[307] " 1.06680681" "-0.17778766" " 0.21527741" " 1.18416533" " 0.55679273" "-0.29505435" " 0.90050623" "-0.29505435" " 0.26373940"
[316] " 0.55679273" " 0.90050623" " 2.01452547" "-0.17778766" " 0.26373940" " 0.26373940" " 0.90050623" "-0.08929494" "-0.92868779"
[325] "-0.20946821" " 0.90050623" "-0.17778766" "-0.20946821" " 1.06680681" " 1.06680681"
จากนั้นควรส่งออกผลลัพธ์ไปยังไฟล์
คำตอบ
ฉันเรียกข้อมูลในลักษณะเดียวกับที่คุณทำ:
# Lookup table with 20 rows: column `X` holds the one-letter residue code
# and column `x` the stickiness value assigned to that residue.
stickiness_tabel <- structure(list(X = c("K", "E", "D", "N", "Q", "S", "P", "R",
"T", "H", "A", "G", "M", "V", "L", "I", "F", "C", "Y", "W"),
x = c(-1.25639466063649, -0.928687786101206, -0.700106643211895,
-0.356971499674196, -0.295054350932285, -0.209468209138379,
-0.177787659972006, -0.0892949396458573, 0.0576667944592403,
0.215277407729333, 0.263739398989502, 0.556792734365241,
0.7448899445842, 0.900506232741908, 1.06680680601946, 1.18416532767113,
1.68723510186035, 1.70109173545121, 1.70150269278206, 2.01452547017961
)), class = "data.frame", row.names = c(NA, -20L))
# One example row of the sequence data frame; the `sequence_full` column
# holds the protein sequence as a single string of one-letter codes.
row <- structure(list(X = 1L, code = "12as_1", nsub2 = 2L, pdb_error2 = "NO",
QSBIO_err_prob = 3.5, chain_name = "B", sequence_full = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL"), row.names = 1L, class = "data.frame")
ตอนนี้สิ่งที่คุณทำได้คือ:
# Named lookup vector: names are residue letters, values are stickiness.
stickiness <- stickiness_tabel$x
names(stickiness) <- stickiness_tabel$X
# For every row, split the sequence into single letters and look each one
# up by name; the result is a list with one named numeric vector per row.
lapply(
  strsplit(row$sequence_full, split = ""),
  function(residues) stickiness[residues]
)
ส่งคืนรายการเวกเตอร์ที่เป็นตัวเลข แต่ละองค์ประกอบของรายการสอดคล้องกับแถวที่คุณแปลงและแต่ละเวกเตอร์เป็นเวกเตอร์ที่มีชื่อของระดับความเหนียวที่ตั้งชื่อตามตัวอักษรที่เกี่ยวข้อง
นี่คือผลลัพธ์ที่คุณคาดหวังหรือไม่? เพราะยังไม่ชัดเจนสำหรับฉันจากคำถามของคุณ
บางทีวิธีแก้ปัญหาด้วย data.table อาจเหมาะกับความต้องการของคุณ
ฉันสร้างชุดข้อมูลตัวอย่าง 1,000 แถวโดยทำซ้ำแถวที่คุณระบุ
library(data.table)
# Build a 1,000-row sample by repeating the single example row.
df <- row[rep(1, 1000), ]
# Convert the sample to a data.table.
df_dt <- setDT(df)
# Named lookup vector: names are residue letters, values are stickiness.
value <- stickiness_tabel$x
names(value) <- stickiness_tabel$X
start <- Sys.time()
# Split every sequence into single characters, then index the lookup vector
# by those characters; the column becomes a list of named numeric vectors.
df_dt[, sequence_full := lapply(
  strsplit(sequence_full, split = ""),
  function(residues) value[residues]
)]
end <- Sys.time()
# Elapsed wall-clock time of the transformation.
end - start
Time difference of 0.03744602 secs
# Show the transformed `sequence_full` entry of the first row.
df_dt[1, sequence_full]
[[1]]
M K T A Y I A K Q
0.74488994 -1.25639466 0.05766679 0.26373940 1.70150269 1.18416533 0.26373940 -1.25639466 -0.29505435
R Q I S F V K S H
-0.08929494 -0.29505435 1.18416533 -0.20946821 1.68723510 0.90050623 -1.25639466 -0.20946821 0.21527741
F S R Q L E E R L
1.68723510 -0.20946821 -0.08929494 -0.29505435 1.06680681 -0.92868779 -0.92868779 -0.08929494 1.06680681
G L I E V Q A P I
0.55679273 1.06680681 1.18416533 -0.92868779 0.90050623 -0.29505435 0.26373940 -0.17778766 1.18416533 ...
โค้ดนี้แปลงตาราง stickiness ของคุณให้เป็นเวกเตอร์ที่มีชื่อ แล้วใช้อักขระแต่ละตัวของ sequence_full ในแต่ละแถวเป็นดัชนีเพื่อดึงค่าจากเวกเตอร์นั้น
ในการส่งออกคุณสามารถทำได้:
# Stack the named stickiness vector of the first row into a two-column data
# frame (`values` = stickiness, `ind` = residue letter) and write it as CSV.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
write.csv(stack(unlist(df_dt[1, sequence_full])), file = "~/sequence_output.csv", row.names = FALSE)
ซึ่งจะเขียนไฟล์ csv ที่มีคอลัมน์หนึ่งเป็นค่า stickiness และอีกคอลัมน์หนึ่งเป็นอักขระของลำดับ (sequence) ที่สอดคล้องกัน