Functions

# download a file from the Zenodo record into ./data (cached after the first call)
get_file <- function(x) {
  dir.create("data", showWarnings=FALSE)
  file <- file.path("data", x)
  if (!file.exists(file)) download.file(
    paste0("https://zenodo.org/record/5142676/files/", x, "?download=1"), file)
  file
}
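
For example, the first call downloads the file into ./data and later calls just return the cached path (hypothetical session shown):

get_file("articles.csv")
## [1] "data/articles.csv"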

# apply mutate() only to the rows of .data that satisfy `condition`
mutate_cond <- function(.data, condition, ..., envir = parent.frame()) {
  require(dplyr)
  
  condition <- eval(substitute(condition), .data, envir)
  .data[condition, ] <- mutate(.data[condition, ], ...)
  .data
}
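
A minimal sketch of how mutate_cond behaves, on made-up data (only matching rows change):

d <- data.frame(repo = c("arxiv", "biorxiv"), category = NA_character_)
mutate_cond(d, repo == "biorxiv", category = "Biology")
##      repo category
## 1   arxiv     <NA>
## 2 biorxiv  Biology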

# for the given repository, rank subcategories by frequency, assign each paper
# its highest-ranked subcategory, and return those with at least n papers
get_main <- function(repo, data, n=100) {
  require(dplyr)
  
  subcat <- data %>%
    filter(repository %in% repo) %>%
    group_by(subcategory) %>%
    summarise(weight = n(), .groups="drop") %>%
    arrange(desc(weight)) %>%
    pull(subcategory)
  
  data %>%
    filter(repository %in% repo) %>%
    group_by(id) %>%
    summarise(main = subcat[subcat %in% subcategory][1], .groups="drop") %>%
    group_by(main) %>%
    summarise(N = n(), .groups="drop") %>%
    arrange(desc(N)) %>%
    filter(N >= n) %>%
    pull(main)
}
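
Once `articles` is built in section 1.1 below, a sketch of how get_main is used:

get_main("psyarxiv", articles)  # psyarxiv main subcategories with at least 100 papers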

# performance: plot observed vs. fitted values with an OLS trend and R-squared label
model_performance <- function(m) {
  require(ggplot2)
  
  # obs. vs fitted
  df <- data.frame(fitted=fitted(m), response=model.response(model.frame(m)))
  ggplot(df) + aes(fitted, response) +
    geom_point(alpha=0.3) + geom_abline() + geom_smooth(method="lm") +
    ggpmisc::stat_poly_eq(formula=y~x, parse=TRUE)
}

1 Data preparation

1.1 Cleaning and filtering

library(dplyr)

articles <- readr::read_csv(get_file("categories.csv")) %>%
  left_join(readr::read_csv(get_file("articles.csv")), by="id") %>%
  rename(subcategory = subcategory_name) %>%
  select(id, repository, category, subcategory, date) %>%
  distinct() %>%
  collect() %>%
  
  # define main categories for repositories other than arXiv
  mutate_cond(subcategory %in% c("Epidemiology", "Clinical Trials"), repository = "medrxiv") %>%
  mutate_cond(repository == "medrxiv", category = "Health Sciences") %>%
  mutate_cond(repository == "biorxiv", category = "Biology") %>%
  mutate_cond(repository == "psyarxiv", category = "Psychology") %>%
  mutate_cond(repository == "socarxiv", category = "Social Sciences")

articles %>%
  group_by(repository, category) %>%
  summarise(subcategories=length(unique(subcategory)), n=n())
articles <- articles %>%
  # mark lockdown period & move time reference to 0
  mutate(year = lubridate::year(date), month = lubridate::month(date)) %>%
  mutate(lockdown = year == 2020 & month > 2) %>%
  mutate(yearmon = (year - min(year)) + (month - 1) / 12) %>%
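  # e.g. if min(year) is 2017, March 2020 maps to (2020-2017) + (3-1)/12 ≈ 3.17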

  # adjustments to Biology and Economics
  mutate_cond(category == "Quantitative Biology", category = "Biology") %>%
  mutate_cond(category == "Quantitative Finance", category = "Economics") %>%
  
  # adjustments to socarxiv
  filter(subcategory != "Social and Behavioral Sciences") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Science and Technology Studies",
    "Environmental Studies",
    "International and Area Studies",
    "Leisure Studies",
    "Organization Development",
    "Leadership Studies"
    ), subcategory = "Other Social and Behavioral Sciences") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Library and Information Science",
    "Linguistics"
    ), subcategory = "Arts and Humanities") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Urban Studies and Planning",
    "Social Statistics"
    ), subcategory = "Sociology") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Social Work",
    "Legal Studies"
    ), subcategory = "Law") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Public Affairs, Public Policy and Public Administration"
    ), subcategory = "Political Science") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Agricultural and Resource Economics"
    ), subcategory = "Economics") %>%
  mutate_cond(id %in% filter(
    ., repository == "socarxiv" & subcategory == "Psychology")$id,
    category = "Psychology") %>%
  filter(subcategory != "Psychology") %>%
  na.omit() %>%
  distinct() %>%
  
  # keep, for socarxiv & psyarxiv, only the main subcategories with at least 100 papers
  filter((!repository %in% c("socarxiv", "psyarxiv")) | subcategory %in% unlist(
    lapply(c("socarxiv", "psyarxiv"), get_main, .)))

articles %>%
  group_by(category) %>%
  summarise(subcategories=length(unique(subcategory)), n=n())
text <- readr::read_csv(get_file("text.csv")) %>%
  select(id, title) %>%
  collect() %>%
  mutate(covidpaper = grepl("covid-19", title, ignore.case=TRUE) |
           grepl("sars-cov-2", title, ignore.case=TRUE) |
           grepl("coronavirus", title, ignore.case=TRUE))
## Warning: One or more parsing issues, see `problems()` for details
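
The three grepl calls could equally be collapsed into one alternation pattern; a sketch of the equivalent mutate step:

mutate(covidpaper = grepl("covid-19|sars-cov-2|coronavirus", title, ignore.case=TRUE))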
authors <- readr::read_csv(get_file("authors.csv")) %>%
  distinct() %>%
  collect() %>%
  mutate(across(probability, as.numeric)) %>%
  mutate(across(alphabetical_ordered, as.logical)) %>%
  mutate_cond(is.na(alphabetical_ordered), alphabetical_ordered=TRUE)

1.2 Feature engineering

# attach article metadata and the COVID flag; drop subcategories first seen in 2020
merge_info <- function(.data) .data %>%
  left_join(articles, by="id") %>%
  left_join(text[, c("id", "covidpaper")], by="id") %>%
  distinct() %>%
  na.omit() %>%
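  # titles matched before 2020 are false positives; force covidpaper to FALSE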
  mutate_cond(year < 2020, covidpaper = FALSE) %>%
  group_by(subcategory) %>%
  filter(min(year) < 2020) %>%
  ungroup()

# features all
df.id <- authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  group_by(id) %>%
  summarise(n_male = sum(gender == "male", na.rm=TRUE),
            n_female = sum(gender == "female", na.rm=TRUE),
            n_na = sum(is.na(gender)),
            total = n_male + n_female, .groups="drop") %>%
  merge_info() %>%
  # remove papers where 25% or more of the authors have missing gender
  filter(n_na < 0.25 * (n_na + n_male + n_female)) %>%
  # keep only subcategory-month groups with more than 30 gendered authors
  group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
  filter(sum(total) > 30) %>%
  ungroup()
df.all <- df.id %>%
  group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
  summarise(p_male = sum(n_male) / sum(total),
            total = sum(total), .groups="drop")

summarise_solo <- function(.data) .data %>%
  group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
  summarise(n_male = sum(gender == "male", na.rm=TRUE),
            n_female = sum(gender == "female", na.rm=TRUE),
            total = n_male + n_female,
            p_male = n_male / total, .groups="drop")

# features first
df.first <- authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(!alphabetical_ordered & rank == "first") %>%
  merge_info() %>%
  summarise_solo()

# features last
df.last <- authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(!alphabetical_ordered & rank == "last") %>%
  merge_info() %>%
  summarise_solo()

# features single
df.single <- authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(rank == "single") %>%
  merge_info() %>%
  summarise_solo()

2 Descriptive analysis

2.1 Pre-processing results

2.1.1 Number of unique papers

articles %>%
  select(id) %>%
  distinct() %>%
  count()

2.1.2 Number of subcategories per category

articles %>%
  select(category, subcategory) %>%
  distinct() %>%
  count(category)

2.1.3 Number of papers per category

articles %>%
  select(category, id) %>%
  distinct() %>%
  count(category)

2.2 Analysis of missing values

2.2.1 Missing authors

authors %>%
  select(-id) %>%
  distinct() %>%
  summarise(
    missing = sum(is.na(probability)) / n(),
    discarded = sum(!is.na(probability) & (probability<0.95 | count<10)) / n(),
    total = n()
  )

2.2.2 Missing authors per year

articles %>%
  select(id, year) %>%
  distinct() %>%
  right_join(authors) %>%
  group_by(year) %>%
  summarise(
    missing = sum(is.na(probability)) / n(),
    discarded = sum(!is.na(probability) & (probability<0.95 | count<10)) / n(),
    total = n()
  ) %>%
  na.omit()
## Joining, by = "id"

2.3 Preprints considered

2.3.1 All authors

pp.all <- articles %>%
  select(id, year) %>%
  distinct() %>%
  count(year)
authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  group_by(id) %>%
  summarise(n_male = sum(gender == "male", na.rm=TRUE),
            n_female = sum(gender == "female", na.rm=TRUE),
            n_na = sum(is.na(gender)),
            total = n_male + n_female, .groups="drop") %>%
  merge_info() %>%
  # remove papers where 25% or more of the authors have missing gender
  filter(n_na < 0.25 * (n_na + n_male + n_female)) %>%
  select(id, year) %>%
  distinct() %>%
  count(year) %>%
  mutate(prop = n / pp.all$n)

2.3.2 First authors

pp.first <- authors %>%
  filter(!alphabetical_ordered & rank == "first") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year)
authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(!alphabetical_ordered & rank == "first") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year) %>%
  mutate(prop = n / pp.first$n)

2.3.3 Last authors

pp.last <- authors %>%
  filter(!alphabetical_ordered & rank == "last") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year)
authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(!alphabetical_ordered & rank == "last") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year) %>%
  mutate(prop = n / pp.last$n)

2.3.4 Single authors

pp.single <- authors %>%
  filter(rank == "single") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year)
authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(rank == "single") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year) %>%
  mutate(prop = n / pp.single$n)

2.4 Evolution over time

2.4.1 Number of papers per month

library(ggplot2)
theme_set(theme_bw())

df.plot.papers <- df.id %>%
  select(yearmon, category, id) %>%
  distinct() %>%
  count(yearmon, category)
df.plot.papers.all <- df.plot.papers %>%
  group_by(yearmon) %>%
  summarise(n = sum(n), .groups="drop") %>%
  mutate(category="(all)")

ggplot(df.plot.papers) +
  aes(yearmon+2017, n) + facet_wrap(~category, scales="free_y") +
  geom_col(data=df.plot.papers.all) +
  geom_smooth(data=df.plot.papers.all) +
  geom_col() + geom_smooth() +
  zoo::scale_x_yearmon(format="%Y") +
  labs(y="Number of papers per month", x=NULL)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

2.4.2 Number of authors per month

df.plot.authors <- df.id %>%
  select(yearmon, category, id, total) %>%
  distinct() %>%
  group_by(yearmon, category) %>%
  summarise(n = sum(total), .groups="drop")
df.plot.authors.all <- df.plot.authors %>%
  group_by(yearmon) %>%
  summarise(n = sum(n), .groups="drop") %>%
  mutate(category="(all)")

ggplot(df.plot.authors) +
  aes(yearmon+2017, n) + facet_wrap(~category, scales="free_y") +
  geom_col(data=df.plot.authors.all) +
  geom_smooth(data=df.plot.authors.all) +
  geom_col() + geom_smooth() +
  zoo::scale_x_yearmon(format="%Y") +
  labs(y="Number of authors per month", x=NULL)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

2.4.3 Proportion of males per month

df.plot.p_male <- df.id %>%
  select(yearmon, category, id, n_male, total) %>%
  distinct() %>%
  group_by(yearmon, category) %>%
  summarise(p_male = sum(n_male) / sum(total), .groups="drop")
df.plot.p_male.all <- df.id %>%
  group_by(yearmon) %>%
  summarise(p_male = sum(n_male) / sum(total), .groups="drop") %>%
  mutate(category="(all)")

ggplot(df.plot.p_male) +
  aes(yearmon+2017, p_male) + facet_wrap(~category) +
  geom_point(data=df.plot.p_male.all) +
  geom_smooth(method="gam", data=df.plot.p_male.all) +
  geom_point() + geom_smooth(method="gam") +
  zoo::scale_x_yearmon(format="%Y") +
  labs(y="Proportion of male authors", x=NULL)
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

3 Modelling

3.1 Simple model
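
Each observation is the monthly proportion of male authors in a subcategory, fitted as a binomial GLM weighted by its number of gendered authors; on the logit scale the model being fit is:

$$\operatorname{logit}(p_{\mathrm{male}}) = \beta_0 + \beta_1\,\mathrm{yearmon} + \beta_2\,\mathrm{lockdown} + \beta_3\,\mathrm{covidpaper}$$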

model.glm <- p_male ~ yearmon + lockdown + covidpaper
fit.glm <- glm(model.glm, df.all, family=binomial, weights=total)
summary(fit.glm)
## 
## Call:
## glm(formula = model.glm, family = binomial, data = df.all, weights = total)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -26.0616   -1.3754    0.9651    2.5727   16.1094  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.826402   0.006090 299.901   <2e-16 ***
## yearmon        -0.104284   0.003042 -34.283   <2e-16 ***
## lockdownTRUE    0.071570   0.007224   9.908   <2e-16 ***
## covidpaperTRUE -0.613698   0.019441 -31.568   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 64761  on 3367  degrees of freedom
## Residual deviance: 62173  on 3364  degrees of freedom
## AIC: 79001
## 
## Number of Fisher Scoring iterations: 4
model_performance(fit.glm)
## `geom_smooth()` using formula 'y ~ x'

performance::check_model(fit.glm)

3.2 Hierarchical model
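
The hierarchical model keeps the same fixed effects and adds nested random intercepts for category and for subcategory within category; with $i$ indexing categories and $j$ subcategories:

$$\operatorname{logit}(p_{\mathrm{male},ij}) = \beta_0 + u_i + u_{ij} + \beta_1\,\mathrm{yearmon} + \beta_2\,\mathrm{lockdown} + \beta_3\,\mathrm{covidpaper}, \qquad u_i \sim \mathcal{N}(0,\sigma_c^2),\ u_{ij} \sim \mathcal{N}(0,\sigma_s^2)$$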

library(lme4)
## Loading required package: Matrix
model <- p_male ~ yearmon + lockdown + covidpaper + (1 | category/subcategory)
fit.all <- glmer(model, df.all, family=binomial, weights=total)
summary(fit.all)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: p_male ~ yearmon + lockdown + covidpaper + (1 | category/subcategory)
##    Data: df.all
## Weights: total
## 
##      AIC      BIC   logLik deviance df.resid 
##  22667.8  22704.5 -11327.9  22655.8     3362 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.8130 -0.7773  0.0898  0.8462  6.2945 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  subcategory:category (Intercept) 0.0852   0.2919  
##  category             (Intercept) 0.3838   0.6196  
## Number of obs: 3368, groups:  subcategory:category, 192; category, 10
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.595286   0.199014   8.016 1.09e-15 ***
## yearmon        -0.049239   0.003202 -15.378  < 2e-16 ***
## lockdownTRUE    0.030571   0.007491   4.081 4.48e-05 ***
## covidpaperTRUE  0.076362   0.021791   3.504 0.000458 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yearmn lcTRUE
## yearmon     -0.030              
## lockdwnTRUE  0.011 -0.629       
## covdpprTRUE -0.005 -0.001 -0.143
model_performance(fit.all)
## `geom_smooth()` using formula 'y ~ x'

performance::check_model(fit.all)

df.re.category <- sjPlot::get_model_data(fit.all, "re")[[2]] %>%
  mutate(group = ifelse(conf.low < 1 & conf.high > 1, NA, group)) %>%
  mutate(term = forcats::fct_reorder(term, estimate))
ggplot(df.re.category) +
  aes(estimate, term, xmin=conf.low, xmax=conf.high, color=group) +
  geom_vline(xintercept=1, color="black", linetype="dashed") +
  geom_errorbarh(height=0, size=1) + geom_point(size=2) +
  scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
  scale_color_manual(
    breaks=c("pos", "neg"), values=c("blue", "red"),
    na.value=grDevices::adjustcolor("grey50", 0.5)) +
  labs(y=NULL, x="Odds ratio", title="Random effect: category") +
  theme(legend.position="none")

df.re.subcategory <- sjPlot::get_model_data(fit.all, "re")[[1]] %>%
  mutate(group = ifelse(conf.low < 1 & conf.high > 1, NA, group)) %>%
  mutate(facet = sapply(strsplit(as.character(term), ":"), tail, 1)) %>%
  mutate(term = forcats::fct_reorder(term, estimate))

ggplot(df.re.subcategory) +
  aes(estimate, term, xmin=conf.low, xmax=conf.high, color=group) +
  facet_grid("facet", scales="free_y", space="free_y") +
  geom_vline(xintercept=1, color="black", linetype="dashed") +
  geom_errorbarh(height=0, size=1) + geom_point(size=2) +
  scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
  scale_color_manual(
    breaks=c("pos", "neg"), values=c("blue", "red"),
    na.value=grDevices::adjustcolor("grey50", 0.5)) +
  labs(y=NULL, x="Odds ratio", title="Random effect: subcategory") +
  theme(legend.position="none")

3.3 All together

fit <- list(
  glm = fit.glm,
  all = fit.all,
  first = glmer(model, df.first, family=binomial, weights=total),
  last = glmer(model, df.last, family=binomial, weights=total),
  single = glmer(model, df.single, family=binomial, weights=total)
)

df.terms <- lapply(fit, sjPlot::get_model_data, "est") %>%
  bind_rows(.id="fit") %>%
  mutate(type = ifelse(fit=="glm", "GLM", "GLMM")) %>%
  mutate(fit = ifelse(fit=="glm", "all", fit)) %>%
  mutate(fit = forcats::fct_inorder(fit)) %>%
  mutate(type = ifelse(p.value >= 0.05, NA, type))

levels(df.terms$term) <- c("COVID paper", "lockdown", "year")
stargazer::stargazer(fit, type="html")
====================================================================================
                                  Dependent variable: p_male
                    ----------------------------------------------------------------
                      logistic            generalized linear mixed-effects
                        (1)          (2)          (3)          (4)          (5)
------------------------------------------------------------------------------------
yearmon              -0.104***    -0.049***    -0.044***    -0.059***     -0.029
                      (0.003)      (0.003)      (0.009)      (0.009)      (0.018)
lockdown              0.072***     0.031***     0.035*        0.008        0.136***
                      (0.007)      (0.007)      (0.020)      (0.021)      (0.052)
covidpaper           -0.614***     0.076***     0.399***     0.129**       0.715**
                      (0.019)      (0.022)      (0.064)      (0.065)      (0.298)
Constant              1.826***     1.595***     1.520***     1.819***      2.480***
                      (0.006)      (0.199)      (0.195)      (0.173)      (0.169)
------------------------------------------------------------------------------------
Observations          3,368        3,368        3,996        4,027        3,517
Log Likelihood       -39,496.310  -11,327.900  -7,182.127   -6,955.634   -3,788.911
Akaike Inf. Crit.     79,000.620   22,667.800   14,376.250   13,923.270    7,589.823
Bayesian Inf. Crit.                22,704.540   14,414.010   13,961.070    7,626.815
====================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
ggplot(df.terms) +
  aes(estimate, term, xmin=conf.low, xmax=conf.high, color=type) +
  facet_grid("fit", scales="free_y", space="free_y") +
  geom_vline(xintercept=1, color="black", linetype="dashed") +
  geom_errorbarh(height=0, size=1) + geom_point(size=2) +
  ggrepel::geom_text_repel(
    aes(label=p.label), nudge_y=0.3, nudge_x=0.01, segment.size=0, show.legend=FALSE) +
  scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
  scale_color_manual(
    breaks=c("GLM", "GLMM"), values=c("red", "black"),
    na.value=grDevices::adjustcolor("grey50", 0.5)) +
  labs(y=NULL, x="Odds ratio", color="Model") +
  theme(legend.position=c(.99, .99), legend.justification=c(1, 1)) +
  theme(legend.background=element_rect(fill="#ffffff88"))