Я экспериментирую с данными CDC, чтобы сформулировать исследовательский вопрос. Я провёл базовый статистический анализ и построил несколько простых графиков, чтобы нагляднее представить данные.
Один из построенных мною графиков приведён здесь:
Можно ли сделать его более наглядным (то есть лучше визуализировать роды комаров по размеру ловушки и количеству собранных самцов)?
Вот мой код:
# Scatter plot of the number of males collected (x) against trap type (y).
# Point colour encodes the trap site and point shape encodes the genus.
ggplot(data = Hawaii.cdc, mapping = aes(x = MalesCollected, y = TrapType)) +
  geom_point(mapping = aes(color = TrapSite, shape = Genus)) +
  theme_bw()
Вот мои (случайно выбранные) данные:
> sampled_df<- sample_n(Hawaii.cdc, 50)
> dput(sampled_df)
structure(list(TrapType = c("BGS Trap", "BGS Trap", "BGS Trap",
"BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap",
"BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "Larval/Pupal Collection",
"BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "UV Light Trap",
"BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap",
"BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap",
"BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "UV Light Trap",
"BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap", "BGS Trap",
"BGS Trap", "BGS Trap", "BGS Trap", "Larval/Pupal Collection",
"BGS Trap", "BGS Trap"), AttractantsUsed = c("Lure", "Lure",
"None", "Lure", "Lure", "Lure", "Lure", "Lure", "Lure", "Lure",
"CO2 and Lure", "Lure", "Lure", "Lure and UV Led Light", "None",
"CO2 and Lure", "Lure", "Lure", "Lure", "Lure", "CO2", "Lure",
"None", "Lure", "CO2 and Lure", "Hay or grass infusion", "Lure",
"Lure", "Lure", "Lure", "Lure", "Lure", "Lure", "Lure", "Lure",
"Lure", "Lure", "BG", "Lure", "Lure", "Lure", "Lure", "Lure",
"None", "Lure", "Lure", "Lure", "None", "CO2 and Lure and Led Uv Light",
"Lure"), TrapID = c("Parking #1", "Air strip #4", "Parking #3",
"#2", "#4", "Parking #2", "Air Strip #1", "Air strip #3", "Air strip #2",
"Parking #4", "Air Strip #1", "Air Strip #1", "Parking #4",
"Air strip #3", "0-6", "Baggage #1", "#4", "Parking #3", "Sewage #2",
"Parking #1", "Sewage #3", "Baggage #4", "Air Strip #4", "Air strip #2",
"Air Strip #4", "Air Strip #1", "Parking #3", "Air strip #2",
"Air strip #4", "Air strip #1", "Parking #2", "#3", "Parking #2",
"Parking #3", "Parking #1", "Parking #3", "Parking #1", "Air strip #1",
"Air Strip #1", "Air Strip #1", "Parking #2", "Air strip #2",
"#2", "Parking #3", "Air Strip #1", "Parking #2", "Air strip #4",
"HOVE Container", "Parking #1", "Air Strip #3"), Latitude = c(19.71314,
19.712502, 19.71311, 20.03724, 20.03977, 19.71312, 19.7121, 19.7124,
19.71224, 19.71311, 19.711245, 19.711245, 19.71311, 19.712048,
20.21925, 19.71502, 20.03977, 19.71311, 19.71238, 19.71314, 19.71239,
19.715, 19.7125, 19.7125, 19.7125, 19.711245, 19.71311, 19.7125,
19.7125, 19.71212, 19.71312, 20.03938, 19.71312, 19.71311, 19.71314,
19.71311, 19.71314, 19.71212, 19.7121, 19.71212, 19.71312, 19.7125,
20.03724, 19.71311, 19.71212, 19.71312, 19.7125, 19.088978, 19.71314,
19.71224), Longitude = c(-155.0396, -155.057462, -155.03906,
-155.82648, -155.83147, -155.03934, -155.05975, -155.05923, -155.05942,
-155.03876, -155.059803, -155.059803, -155.03876, -155.057852,
-155.75585, -155.04094, -155.83147, -155.03906, -155.03697, -155.0396,
-155.03725, -155.04027, -155.05894, -155.05942, -155.05894, -155.059803,
-155.03906, -155.05942, -155.05894, -155.05975, -155.03934, -155.82956,
-155.03934, -155.03906, -155.0396, -155.03906, -155.0396, -155.05975,
-155.05975, -155.05975, -155.03934, -155.05942, -155.82648, -155.03906,
-155.05975, -155.03934, -155.05894, -155.762254, -155.0396, -155.05923
), Address = c("Kekuanaoa Street", "Airport Road", "Kekuanaoa Street",
"Kawaihae Rd.", "Kawaihae Rd.", "Kekuanaoa Street", "Airport Road",
"Airport Road", "Airport Rd.", "Kekuanaoa Street", "Airport Road",
"Airport Road", "Kekuanaoa St.", "Airport Rd.", "Akoni Pule Hwy.Kohala",
"Kukuanaoa Street", "Kawaihae Road", "Kekuanaoa Street", "Kekuanaoa Street",
"Kekuanaoa St.", "Kekuanaoa St.", "Kekuanaoa St.", "Airport Road",
"Airport Road", "Airport Road", "Airport Road", "Kekuanaoa Street",
"Airport Road", "Airport Road", "Airport Road", "Kekuanaoa Street",
"Kawaihae Rd.", "Kekuanaoa St.", "Kekuanaoa St.", "Kekuanaoa St.",
"Kekuanaoa St.", "Kekuanaoa St.", "Airport Rd.", "Airport Road",
"Airport Rd.", "Kekuanaoa Street", "Airport Road", "Kawaihae Rd.",
"Kekuanaoa St.", "Airport Road", "Kekuanaoa Street", "Airport Road",
"Keaka Blvd.", "Kekuanaoa Street", "Airport Road"), Town = c("Hilo",
"Hilo", "Hilo", "Kawaihae", "Kawaihae", "Hilo", "Hilo", "Hilo",
"Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Kohala", "Hilo",
"Kawaihae", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo",
"Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Kawaihae",
"Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo", "Hilo",
"Hilo", "Hilo", "Kawaihae", "Hilo", "Hilo", "Hilo", "Hilo", "Ocean View",
"Hilo", "Hilo"), State = c("HI", "HI", "HI", "HI", "HI", "HI",
"HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI",
"HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI",
"HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI",
"HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI", "HI"
), County = c("Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County", "Hawaii County",
"Hawaii County", "Hawaii County", "Hawaii County"), TrapSite = c("Airport",
"Airport", "Airport", "Business", "Business", "Airport", "Airport",
"Airport", "Business", "Airport", "Airport", "Airport", "Business",
"Airport", "Residential", "Airport", "Business", "Airport", "Airport",
"Airport", "Business", "Airport", "Airport", "Airport", "Airport",
"Airport", "Airport", "Airport", "Airport", "Airport", "Airport",
"Business", "Business", "Business", "Business", "Business", "Airport",
"Airport", "Airport", "Business", "Airport", "Airport", "Business",
"Business", "Airport", "Airport", "Airport", "Residential", "Airport",
"Airport"), TrapSet = c("05/23/2017", "01/06/2020", "09/25/2017",
"08/29/2017", "07/04/2017", "02/05/2018", "01/28/2019", "04/02/2018",
"12/10/2018", "03/09/2020", "07/29/2019", "03/25/2019", "12/16/2018",
"01/21/2020", "07/12/2017", "06/21/2017", "11/20/2017", "06/06/2017",
"06/19/2017", "12/19/2017", "06/21/2017", "08/07/2017", "06/19/2017",
"08/27/2018", "06/22/2017", "07/08/2019", "06/25/2018", "04/02/2018",
"09/03/2018", "03/12/2018", "10/29/2018", "07/12/2017", "01/21/2019",
"01/14/2019", "01/07/2019", "12/10/2018", "09/06/2017", "12/06/2017",
"12/03/2018", "07/10/2017", "07/09/2018", "07/02/2018", "06/13/2017",
"06/21/2017", "07/26/2017", "10/16/2017", "10/01/2018", "10/17/2017",
"09/16/2019", "11/26/2018"), SetTimeOfDay = c("Morning", "Morning",
"Morning", "Afternoon", "Afternoon", "Morning", "Morning", "Morning",
"Morning", "Morning", "Morning", "Morning", "Morning", "Morning",
"Afternoon", "Morning", "Morning", "Morning", "Morning", "Afternoon",
"Afternoon", "Afternoon", "Morning", "Morning", "Morning", "Morning",
"Morning", "Morning", "Morning", "Morning", "Morning", "Afternoon",
"Morning", "Morning", "Morning", "Morning", "Afternoon", "Afternoon",
"Morning", "Afternoon", "Morning", "Morning", "Afternoon", "Afternoon",
"Morning", "Morning", "Morning", "Afternoon", "Morning", "Morning"
), TrapCollect = c("05/25/2017", "01/10/2020", "09/26/2017",
"08/31/2017", "07/05/2017", "02/08/2018", "02/04/2019", "04/06/2018",
"12/14/2018", "03/13/2020", "08/02/2019", "03/29/2019", "12/20/2018",
"01/24/2020", "07/12/2017", "06/22/2017", "11/22/2017", "06/08/2017",
"06/20/2017", "12/20/2017", "06/22/2017", "08/09/2017", "06/20/2017",
"08/31/2018", "06/23/2017", "07/12/2019", "06/29/2018", "04/06/2018",
"09/07/2018", "03/16/2018", "11/02/2018", "07/14/2017", "01/28/2019",
"01/18/2019", "01/14/2019", "12/14/2018", "09/08/2017", "12/07/2017",
"12/07/2018", "07/11/2017", "07/13/2018", "07/06/2018", "06/15/2017",
"06/22/2017", "07/28/2017", "10/20/2017", "10/05/2018", "10/17/2017",
"09/20/2019", "11/30/2018"), CollectTimeOfDay = c("Morning",
"Morning", "Morning", "Afternoon", "Afternoon", "Morning", "Morning",
"Morning", "Morning", "Morning", "Morning", "Morning", "Morning",
"Morning", "Afternoon", "Morning", "Morning", "Morning", "Morning",
"Afternoon", "Afternoon", "Afternoon", "Morning", "Morning",
"Morning", "Morning", "Morning", "Morning", "Morning", "Morning",
"Morning", "Afternoon", "Morning", "Morning", "Morning", "Morning",
"Afternoon", "Afternoon", "Morning", "Afternoon", "Morning",
"Morning", "Afternoon", "Afternoon", "Morning", "Morning", "Morning",
"Afternoon", "Morning", "Morning"), Genus = c("Aedes", "Aedes",
"Aedes", "Aedes", "Aedes", "Culex", "Culex", "Culex", "Aedes",
"Aedes", "Aedes", "Aedes", "Aedes", "Aedes", "Aedes", "Aedes",
"Aedes", "Aedes", "Aedes", "Aedes", "Culex", "Culex", "Aedes",
"Aedes", "Culex", "Culex", "Aedes", "Culex", "Culex", "Culex",
"Aedes", "Aedes", "Aedes", "Culex", "Aedes", "Aedes", "Culex",
"Culex", "Aedes", "Aedes", "Aedes", "Aedes", "Aedes", "Aedes",
"Aedes", "Aedes", "Aedes", "Aedes", "Aedes", "Aedes"), Species = c("albopictus",
"albopictus", "albopictus", "aegypti", "albopictus", "quinquefasciatus",
"quinquefasciatus", "quinquefasciatus", "albopictus", "albopictus",
"albopictus", "albopictus", "albopictus", "albopictus", "albopictus",
"albopictus", "aegypti", "albopictus", "albopictus", "vexans",
"quinquefasciatus", "quinquefasciatus", "albopictus", "albopictus",
"quinquefasciatus", "quinquefasciatus", "albopictus", "quinquefasciatus",
"quinquefasciatus", "quinquefasciatus", "albopictus", "aegypti",
"albopictus", "quinquefasciatus", "albopictus", "albopictus",
"quinquefasciatus", "quinquefasciatus", "albopictus", "albopictus",
"albopictus", "albopictus", "albopictus", "albopictus", "albopictus",
"albopictus", "albopictus", "aegypti", "albopictus", "albopictus"
), LifeStage = c("Adult", "Adult", "Adult", "Adult", "Adult",
"Adult", "Adult", "Adult", "Adult", "Adult", "Adult", "Adult",
"Adult", "Adult", "Larvae/pupae", "Adult", "Adult", "Adult",
"Adult", "Adult", "Adult", "Adult", "Adult", "Adult", "Adult",
"Adult", "Adult", "Adult", "Adult", "Adult", "Adult", "Adult",
"Adult", "Adult", "Adult", "Adult", "Adult", "Adult", "Adult",
"Adult", "Adult", "Adult", "Adult", "Adult", "Adult", "Adult",
"Adult", "Larvae/pupae", "Adult", "Adult"), EggsCollected = c("No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No"), LarvaeCollected = c("No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "Yes", "No", "No"), PupaeCollected = c("No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "Yes", "No", "No"), FemalesCollected = c(32, 1, 10,
5, 1, 7, 2, 4, 3, 7, 33, 19, 0, 8, 15, 1, 2, 8, 1, 3, 12, 1,
1, 1, 1, 2, 3, 1, 10, 2, 15, 1, 18, 2, 29, 2, 3, 1, 9, 3, 6,
5, 4, 2, 0, 4, 6, 1, 22, 2), MalesCollected = c(54, 0, 9, 3,
1, 0, 0, 0, 0, 0, 4, 5, 2, 0, 12, 0, 15, 4, 0, 2, 0, 0, 5, 1,
0, 0, 0, 0, 0, 0, 6, 0, 3, 0, 6, 0, 0, 1, 4, 3, 1, 2, 3, 1, 1,
1, 3, 0, 5, 0), UnknownCollected = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), ReportDate = c("08/01/2017 8:31 PM", "05/07/2020 4:13 PM",
"04/19/2018 8:44 PM", "04/19/2018 8:44 PM", "08/01/2017 8:31 PM",
"01/31/2019 8:05 PM", "07/30/2019 9:38 PM", "10/05/2018 9:15 PM",
"07/30/2019 9:38 PM", "05/07/2020 4:13 PM", "05/07/2020 4:13 PM",
"05/07/2020 4:13 PM", "07/30/2019 9:38 PM", "05/07/2020 4:13 PM",
"04/19/2018 8:44 PM", "08/01/2017 8:31 PM", "04/19/2018 8:44 PM",
"08/01/2017 8:31 PM", "08/01/2017 8:31 PM", "04/19/2018 8:44 PM",
"08/01/2017 8:31 PM", "04/19/2018 8:44 PM", "08/01/2017 8:31 PM",
"10/05/2018 9:15 PM", "08/01/2017 8:31 PM", "05/07/2020 4:13 PM",
"10/05/2018 9:15 PM", "10/05/2018 9:15 PM", "10/05/2018 9:15 PM",
"10/05/2018 9:15 PM", "01/31/2019 8:45 PM", "08/01/2017 8:31 PM",
"07/30/2019 9:38 PM", "07/30/2019 9:38 PM", "07/30/2019 9:38 PM",
"07/30/2019 9:38 PM", "04/19/2018 8:44 PM", "04/19/2018 8:44 PM",
"07/30/2019 9:38 PM", "08/01/2017 8:31 PM", "01/31/2019 8:22 PM",
"10/05/2018 9:15 PM", "08/01/2017 8:31 PM", "08/01/2017 8:31 PM",
"04/19/2018 8:44 PM", "04/19/2018 8:44 PM", "11/20/2018 7:37 PM",
"04/19/2018 8:44 PM", "05/07/2020 4:13 PM", "07/30/2019 9:38 PM"
)), row.names = c(NA, -50L), spec = structure(list(cols = list(
TrapType = structure(list(), class = c("collector_character",
"collector")), AttractantsUsed = structure(list(), class = c("collector_character",
"collector")), TrapID = structure(list(), class = c("collector_character",
"collector")), Latitude = structure(list(), class = c("collector_double",
"collector")), Longitude = structure(list(), class = c("collector_double",
"collector")), Address = structure(list(), class = c("collector_character",
"collector")), Town = structure(list(), class = c("collector_character",
"collector")), State = structure(list(), class = c("collector_character",
"collector")), County = structure(list(), class = c("collector_character",
"collector")), TrapSite = structure(list(), class = c("collector_character",
"collector")), TrapSet = structure(list(), class = c("collector_character",
"collector")), SetTimeOfDay = structure(list(), class = c("collector_character",
"collector")), TrapCollect = structure(list(), class = c("collector_character",
"collector")), CollectTimeOfDay = structure(list(), class = c("collector_character",
"collector")), Genus = structure(list(), class = c("collector_character",
"collector")), Species = structure(list(), class = c("collector_character",
"collector")), LifeStage = structure(list(), class = c("collector_character",
"collector")), EggsCollected = structure(list(), class = c("collector_character",
"collector")), LarvaeCollected = structure(list(), class = c("collector_character",
"collector")), PupaeCollected = structure(list(), class = c("collector_character",
"collector")), FemalesCollected = structure(list(), class = c("collector_double",
"collector")), MalesCollected = structure(list(), class = c("collector_double",
"collector")), UnknownCollected = structure(list(), class = c("collector_double",
"collector")), ReportDate = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x600002e98b20>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Это во многом дело вкуса, но мои предложения такие: 1) не используйте цвет для отображения 13 категорий — трудно надёжно различить более 10 цветов; для этой переменной, возможно, лучше подойдут фасеты. 2) Точки сильно накладываются друг на друга (overplotting), из-за чего трудно различать формы и цвета, — здесь лучше использовать geom_jitter. 3) Часто полезно упорядочивать категории по чему-то содержательному (например, по количеству собранных самцов) с помощью факторов (например, %>% mutate(TrapType = forcats::fct_reorder(TrapType, MalesCollected))
), а не просто использовать алфавитный порядок.
Какая визуализация «самая наглядная», зависит от того, что вы хотите показать. Здесь вы отображаете 4 разные переменные, по-видимому, пытаясь ответить на вопрос «Насколько эффективен каждый тип ловушки против какого рода в каком месте?». Это может быть полезно для разведочного анализа (EDA), чтобы заметить закономерности, но плохо подходит для представления результатов: график передаёт сразу много сообщений, и ни одно из них — чётко. Сформулируйте более узкий вопрос или наблюдение и адаптируйте визуализацию так, чтобы подчеркнуть именно его, например: «Большинство ловушек находилось в аэропортах» или «Ловушки Ovicup поймали больше комаров Culex, чем ловушки с ультрафиолетовым излучением».
Как насчет чего-то вроде тепловой карты:
# Heatmap of male mosquito captures: for every genus, the share of that
# genus's males that was caught in each trap type x trap site combination.
tmp <- sampled_df %>%
  group_by(TrapType, TrapSite, Genus) %>%
  # .groups = "drop" returns an ungrouped result and silences the regrouping
  # message summarise() prints after grouping by several variables.
  summarise(n = sum(MalesCollected, na.rm = TRUE), .groups = "drop") %>%
  group_by(Genus) %>%
  # Share within genus; pct is NaN for a genus with no males at all (0 / 0).
  mutate(pct = n / sum(n)) %>%
  # Drop the grouping so later operations on tmp are not silently per-genus.
  ungroup()

# One facet per genus, stacked vertically; fill shows the within-genus share.
ggplot(tmp, aes(x = TrapSite, y = TrapType, fill = pct)) +
  geom_tile(colour = "white") +
  facet_wrap(~Genus, ncol = 1) +
  theme_bw() +
  theme(panel.grid = element_blank())
Вот моё предложение с использованием position_jitter:
library(tidyverse)

# Jittered scatter of male counts per trap type; colour encodes the trap site
# and point shape encodes the genus.
# log1p() is used instead of log() because MalesCollected contains zeros:
# log(0) is -Inf, whereas log1p(0) is 0, so zero-catch records get a finite
# position on the axis.
ggplot(Hawaii.cdc, aes(x = TrapType, y = log1p(MalesCollected))) +
  geom_point(
    aes(color = TrapSite, shape = Genus),
    # Keep the vertical jitter small: on the log1p scale counts of 0 and 1
    # are only about 0.69 apart, so a larger spread would blend them together.
    position = position_jitter(width = 0.2, height = 0.1),
    size = 2
  ) +
  # The axis shows transformed values, so the label says so.
  ylab("log(1 + Males Collected)") +
  theme_bw()
Загляните на clauswilke.com/dataviz — там есть отличные вводные материалы и идеи по визуализации.