Functions
get_file <- function(x) {
dir.create("data", showWarnings=FALSE)
file <- file.path("data", x)
if (!file.exists(file)) download.file(
paste0("https://zenodo.org/record/5142676/files/", x, "?download=1"), file)
file
}
mutate_cond <- function(.data, condition, ..., envir = parent.frame()) {
require(dplyr)
condition <- eval(substitute(condition), .data, envir)
.data[condition, ] <- mutate(.data[condition, ], ...)
.data
}
get_main <- function(repo, data, n=100) {
require(dplyr)
subcat <- data %>%
filter(repository %in% repo) %>%
group_by(subcategory) %>%
summarise(weight = n(), .groups="drop") %>%
arrange(desc(weight)) %>%
pull(subcategory)
data %>%
filter(repository %in% repo) %>%
group_by(id) %>%
summarise(main = subcat[subcat %in% subcategory][1], .groups="drop") %>%
group_by(main) %>%
summarise(N = n(), .groups="drop") %>%
arrange(desc(N)) %>%
filter(N >= n) %>%
pull(main)
}
# performance
model_performance <- function(m) {
require(ggplot2)
# obs. vs fitted
df <- data.frame(fitted=fitted(m), response=model.response(model.frame(m)))
ggplot(df) + aes(fitted, response) +
geom_point(alpha=0.3) + geom_abline() + geom_smooth(method="lm") +
ggpmisc::stat_poly_eq(formula=y~x, parse=TRUE)
}
library(dplyr)
articles <- readr::read_csv(get_file("categories.csv")) %>%
left_join(readr::read_csv(get_file("articles.csv")), by="id") %>%
rename(subcategory = subcategory_name) %>%
select(id, repository, category, subcategory, date) %>%
distinct() %>%
collect() %>%
# define main categories for repo != arXiv
mutate_cond(subcategory %in% c("Epidemiology", "Clinical Trials"), repository = "medrxiv") %>%
mutate_cond(repository == "medrxiv", category = "Health Sciences") %>%
mutate_cond(repository == "biorxiv", category = "Biology") %>%
mutate_cond(repository == "psyarxiv", category = "Psychology") %>%
mutate_cond(repository == "socarxiv", category = "Social Sciences")
articles %>%
group_by(repository, category) %>%
summarise(subcategories=length(unique(subcategory)), n=n())
articles <- articles %>%
# mark lockdown period & move time reference to 0
mutate(year = lubridate::year(date), month = lubridate::month(date)) %>%
mutate(lockdown = year == 2020 & month > 2) %>%
mutate(yearmon = (year - min(year)) + (month - 1) / 12) %>%
# adjustments to Biology and Economics
mutate_cond(category == "Quantitative Biology", category = "Biology") %>%
mutate_cond(category == "Quantitative Finance", category = "Economics") %>%
# adjustments to socarxiv
filter(subcategory != "Social and Behavioral Sciences") %>%
mutate_cond(repository == "socarxiv" & subcategory %in% c(
"Science and Technology Studies",
"Environmental Studies",
"International and Area Studies",
"Leisure Studies",
"Organization Development",
"Leadership Studies"
), subcategory = "Other Social and Behavioral Sciences") %>%
mutate_cond(repository == "socarxiv" & subcategory %in% c(
"Library and Information Science",
"Linguistics"
), subcategory = "Arts and Humanities") %>%
mutate_cond(repository == "socarxiv" & subcategory %in% c(
"Urban Studies and Planning",
"Social Statistics"
), subcategory = "Sociology") %>%
mutate_cond(repository == "socarxiv" & subcategory %in% c(
"Social Work",
"Legal Studies"
), subcategory = "Law") %>%
mutate_cond(repository == "socarxiv" & subcategory %in% c(
"Public Affairs, Public Policy and Public Administration"
), subcategory = "Political Science") %>%
mutate_cond(repository == "socarxiv" & subcategory %in% c(
"Agricultural and Resource Economics"
), subcategory = "Economics") %>%
mutate_cond(id %in% filter(
., repository == "socarxiv" & subcategory == "Psychology")$id,
category = "Psychology") %>%
filter(subcategory != "Psychology") %>%
na.omit() %>%
distinct() %>%
# get just main categories with more than 100 papers for socarxiv & psyarxiv
filter((!repository %in% c("socarxiv", "psyarxiv")) | subcategory %in% unlist(
lapply(c("socarxiv", "psyarxiv"), get_main, .)))
articles %>%
group_by(category) %>%
summarise(subcategories=length(unique(subcategory)), n=n())
text <- readr::read_csv(get_file("text.csv")) %>%
select(id, title) %>%
collect() %>%
mutate(covidpaper = grepl("covid-19", title, ignore.case=TRUE) |
grepl("sars-cov-2", title, ignore.case=TRUE) |
grepl("coronavirus", title, ignore.case=TRUE))
## Warning: One or more parsing issues, see `problems()` for details
authors <- readr::read_csv(get_file("authors.csv")) %>%
distinct() %>%
collect() %>%
mutate(across(probability, as.numeric)) %>%
mutate(across(alphabetical_ordered, as.logical)) %>%
mutate_cond(is.na(alphabetical_ordered), alphabetical_ordered=TRUE)
merge_info <- function(.data) .data %>%
left_join(articles, by="id") %>%
left_join(text[, c("id", "covidpaper")], by="id") %>%
distinct() %>%
na.omit() %>%
mutate_cond(year < 2020, covidpaper = FALSE) %>%
group_by(subcategory) %>%
filter(min(year) < 2020) %>%
ungroup()
# features all
df.id <- authors %>%
filter(probability >= 0.95 & count >= 10) %>%
group_by(id) %>%
summarise(n_male = sum(gender == "male", na.rm=TRUE),
n_female = sum(gender == "female", na.rm=TRUE),
n_na = sum(is.na(gender)),
total = n_male + n_female, .groups="drop") %>%
merge_info() %>%
# remove papers with more than 25% of authors with missing gender
filter(n_na < 0.25 * (n_na + n_male + n_female)) %>%
# remove observations with less than 30 people per month
group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
filter(sum(total) > 30) %>%
ungroup()
df.all <- df.id %>%
group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
summarise(p_male = sum(n_male) / sum(total),
total = sum(total), .groups="drop")
summarise_solo <- function(.data) .data %>%
group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
summarise(n_male = sum(gender == "male", na.rm=TRUE),
n_female = sum(gender == "female", na.rm=TRUE),
total = n_male + n_female,
p_male = n_male / total, .groups="drop")
# features first
df.first <- authors %>%
filter(probability >= 0.95 & count >= 10) %>%
filter(!alphabetical_ordered & rank == "first") %>%
merge_info() %>%
summarise_solo()
# features last
df.last <- authors %>%
filter(probability >= 0.95 & count >= 10) %>%
filter(!alphabetical_ordered & rank == "last") %>%
merge_info() %>%
summarise_solo()
# features single
df.single <- authors %>%
filter(probability >= 0.95 & count >= 10) %>%
filter(rank == "single") %>%
merge_info() %>%
summarise_solo()
articles %>%
select(id) %>%
distinct() %>%
count()
articles %>%
select(category, subcategory) %>%
distinct() %>%
count(category)
articles %>%
select(category, id) %>%
distinct() %>%
count(category)
library(ggplot2)
theme_set(theme_bw())
df.plot.papers <- df.id %>%
select(yearmon, category, id) %>%
distinct() %>%
count(yearmon, category)
df.plot.papers.all <- df.plot.papers %>%
group_by(yearmon) %>%
summarise(n = sum(n), .groups="drop") %>%
mutate(category="(all)")
ggplot(df.plot.papers) +
aes(yearmon+2017, n) + facet_wrap(~category, scales="free_y") +
geom_col(data=df.plot.papers.all) +
geom_smooth(data=df.plot.papers.all) +
geom_col() + geom_smooth() +
zoo::scale_x_yearmon(format="%Y") +
labs(y="Number of papers per month", x=NULL)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
df.plot.p_male <- df.id %>%
select(yearmon, category, id, n_male, total) %>%
distinct() %>%
group_by(yearmon, category) %>%
summarise(p_male = sum(n_male) / sum(total), .groups="drop")
df.plot.p_male.all <- df.id %>%
group_by(yearmon) %>%
summarise(p_male = sum(n_male) / sum(total), .groups="drop") %>%
mutate(category="(all)")
ggplot(df.plot.p_male) +
aes(yearmon+2017, p_male) + facet_wrap(~category) +
geom_point(data=df.plot.p_male.all) +
geom_smooth(method="gam", data=df.plot.p_male.all) +
geom_point() + geom_smooth(method="gam") +
zoo::scale_x_yearmon(format="%Y") +
labs(y="Proportion of male authors", x=NULL)
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
model.glm <- p_male ~ yearmon + lockdown + covidpaper
fit.glm <- glm(model.glm, df.all, family=binomial, weights=total)
summary(fit.glm)
##
## Call:
## glm(formula = model.glm, family = binomial, data = df.all, weights = total)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -26.0616 -1.3754 0.9651 2.5727 16.1094
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.826402 0.006090 299.901 <2e-16 ***
## yearmon -0.104284 0.003042 -34.283 <2e-16 ***
## lockdownTRUE 0.071570 0.007224 9.908 <2e-16 ***
## covidpaperTRUE -0.613698 0.019441 -31.568 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 64761 on 3367 degrees of freedom
## Residual deviance: 62173 on 3364 degrees of freedom
## AIC: 79001
##
## Number of Fisher Scoring iterations: 4
model_performance(fit.glm)
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "numericVector" of class "Mnumeric"; definition not updated
## `geom_smooth()` using formula 'y ~ x'
performance::check_model(fit.glm)
library(lme4)
## Loading required package: Matrix
model <- p_male ~ yearmon + lockdown + covidpaper + (1 | category/subcategory)
fit.all <- glmer(model, df.all, family=binomial, weights=total)
summary(fit.all)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: p_male ~ yearmon + lockdown + covidpaper + (1 | category/subcategory)
## Data: df.all
## Weights: total
##
## AIC BIC logLik deviance df.resid
## 22667.8 22704.5 -11327.9 22655.8 3362
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.8130 -0.7773 0.0898 0.8462 6.2945
##
## Random effects:
## Groups Name Variance Std.Dev.
## subcategory:category (Intercept) 0.0852 0.2919
## category (Intercept) 0.3838 0.6196
## Number of obs: 3368, groups: subcategory:category, 192; category, 10
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.595286 0.199014 8.016 1.09e-15 ***
## yearmon -0.049239 0.003202 -15.378 < 2e-16 ***
## lockdownTRUE 0.030571 0.007491 4.081 4.48e-05 ***
## covidpaperTRUE 0.076362 0.021791 3.504 0.000458 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yearmn lcTRUE
## yearmon -0.030
## lockdwnTRUE 0.011 -0.629
## covdpprTRUE -0.005 -0.001 -0.143
model_performance(fit.all)
## `geom_smooth()` using formula 'y ~ x'
performance::check_model(fit.all)
df.re.category <- sjPlot::get_model_data(fit.all, "re")[[2]] %>%
mutate(group = ifelse(conf.low < 1 & conf.high > 1, NA, group)) %>%
mutate(term = forcats::fct_reorder(term, estimate))
## Registered S3 methods overwritten by 'parameters':
## method from
## as.double.parameters_kurtosis datawizard
## as.double.parameters_skewness datawizard
## as.double.parameters_smoothness datawizard
## as.numeric.parameters_kurtosis datawizard
## as.numeric.parameters_skewness datawizard
## as.numeric.parameters_smoothness datawizard
## print.parameters_distribution datawizard
## print.parameters_kurtosis datawizard
## print.parameters_skewness datawizard
## summary.parameters_kurtosis datawizard
## summary.parameters_skewness datawizard
## Warning in checkDepPackageVersion(dep_pkg = "TMB"): Package version inconsistency detected.
## glmmTMB was built with TMB version 1.7.21
## Current TMB version is 1.7.22
## Please re-install glmmTMB from source or restore original 'TMB' package (see '?reinstalling' for more information)
ggplot(df.re.category) +
aes(estimate, term, xmin=conf.low, xmax=conf.high, color=group) +
geom_vline(xintercept=1, color="black", linetype="dashed") +
geom_errorbarh(height=0, size=1) + geom_point(size=2) +
scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
scale_color_manual(
breaks=c("pos", "neg"), values=c("blue", "red"),
na.value=grDevices::adjustcolor("grey50", 0.5)) +
labs(y=NULL, x="Odds ratio", title="Random effect: category") +
theme(legend.position="none")
df.re.subcategory <- sjPlot::get_model_data(fit.all, "re")[[1]] %>%
mutate(group = ifelse(conf.low < 1 & conf.high > 1, NA, group)) %>%
mutate(facet = sapply(strsplit(as.character(term), ":"), tail, 1)) %>%
mutate(term = forcats::fct_reorder(term, estimate))
ggplot(df.re.subcategory) +
aes(estimate, term, xmin=conf.low, xmax=conf.high, color=group) +
facet_grid("facet", scales="free_y", space="free_y") +
geom_vline(xintercept=1, color="black", linetype="dashed") +
geom_errorbarh(height=0, size=1) + geom_point(size=2) +
scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
scale_color_manual(
breaks=c("pos", "neg"), values=c("blue", "red"),
na.value=grDevices::adjustcolor("grey50", 0.5)) +
labs(y=NULL, x="Odds ratio", title="Random effect: subcategory") +
theme(legend.position="none")
fit <- list(
glm = fit.glm,
all = fit.all,
first = glmer(model, df.first, family=binomial, weights=total),
last = glmer(model, df.last, family=binomial, weights=total),
single = glmer(model, df.single, family=binomial, weights=total)
)
df.terms <- lapply(fit, sjPlot::get_model_data, "est") %>%
bind_rows(.id="fit") %>%
mutate(type = ifelse(fit=="glm", "GLM", "GLMM")) %>%
mutate(fit = ifelse(fit=="glm", "all", fit)) %>%
mutate(fit = forcats::fct_inorder(fit)) %>%
mutate(type = ifelse(p.value >= 0.05, NA, type))
levels(df.terms$term) <- c("COVID paper", "lockdown", "year")
stargazer::stargazer(fit, type="html")
Dependent variable: | |||||
p_male | |||||
logistic | generalized linear | ||||
mixed-effects | |||||
(1) | (2) | (3) | (4) | (5) | |
yearmon | -0.104*** | -0.049*** | -0.044*** | -0.059*** | -0.029 |
(0.003) | (0.003) | (0.009) | (0.009) | (0.018) | |
lockdown | 0.072*** | 0.031*** | 0.035* | 0.008 | 0.136*** |
(0.007) | (0.007) | (0.020) | (0.021) | (0.052) | |
covidpaper | -0.614*** | 0.076*** | 0.399*** | 0.129** | 0.715** |
(0.019) | (0.022) | (0.064) | (0.065) | (0.298) | |
Constant | 1.826*** | 1.595*** | 1.520*** | 1.819*** | 2.480*** |
(0.006) | (0.199) | (0.195) | (0.173) | (0.169) | |
Observations | 3,368 | 3,368 | 3,996 | 4,027 | 3,517 |
Log Likelihood | -39,496.310 | -11,327.900 | -7,182.127 | -6,955.634 | -3,788.911 |
Akaike Inf. Crit. | 79,000.620 | 22,667.800 | 14,376.250 | 13,923.270 | 7,589.823 |
Bayesian Inf. Crit. | 22,704.540 | 14,414.010 | 13,961.070 | 7,626.815 | |
Note: | p<0.1; p<0.05; p<0.01 |
ggplot(df.terms) +
aes(estimate, term, xmin=conf.low, xmax=conf.high, color=type) +
facet_grid("fit", scales="free_y", space="free_y") +
geom_vline(xintercept=1, color="black", linetype="dashed") +
geom_errorbarh(height=0, size=1) + geom_point(size=2) +
ggrepel::geom_text_repel(
aes(label=p.label), nudge_y=0.3, nudge_x=0.01, segment.size=0, show.legend=FALSE) +
scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
scale_color_manual(
breaks=c("GLM", "GLMM"), values=c("red", "black"),
na.value=grDevices::adjustcolor("grey50", 0.5)) +
labs(y=NULL, x="Odds ratio", color="Model") +
theme(legend.position=c(.99, .99), legend.justification=c(1, 1)) +
theme(legend.background=element_rect(fill="#ffffff88"))