Functions

# download a file from the Zenodo record into ./data (cached after the first call)
get_file <- function(x) {
  dir.create("data", showWarnings=FALSE)
  file <- file.path("data", x)
  if (!file.exists(file)) download.file(
    paste0("https://zenodo.org/record/5142676/files/", x, "?download=1"), file)
  file
}
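
For example, the first call downloads the file into ./data and later calls just return the cached path (hypothetical session shown):

get_file("articles.csv")
## [1] "data/articles.csv"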

# apply mutate() only to the rows of .data that satisfy `condition`
mutate_cond <- function(.data, condition, ..., envir = parent.frame()) {
  require(dplyr)
  
  condition <- eval(substitute(condition), .data, envir)
  .data[condition, ] <- mutate(.data[condition, ], ...)
  .data
}
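
A minimal sketch of how mutate_cond behaves, on made-up data (only matching rows change):

d <- data.frame(repo = c("arxiv", "biorxiv"), category = NA_character_)
mutate_cond(d, repo == "biorxiv", category = "Biology")
##      repo category
## 1   arxiv     <NA>
## 2 biorxiv  Biology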

# for the given repository, rank subcategories by frequency, assign each paper
# its highest-ranked subcategory, and return those with at least n papers
get_main <- function(repo, data, n=100) {
  require(dplyr)
  
  subcat <- data %>%
    filter(repository %in% repo) %>%
    group_by(subcategory) %>%
    summarise(weight = n(), .groups="drop") %>%
    arrange(desc(weight)) %>%
    pull(subcategory)
  
  data %>%
    filter(repository %in% repo) %>%
    group_by(id) %>%
    summarise(main = subcat[subcat %in% subcategory][1], .groups="drop") %>%
    group_by(main) %>%
    summarise(N = n(), .groups="drop") %>%
    arrange(desc(N)) %>%
    filter(N >= n) %>%
    pull(main)
}
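
Once `articles` is built in section 1.1 below, a sketch of how get_main is used:

get_main("psyarxiv", articles)  # psyarxiv main subcategories with at least 100 papers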

# performance: plot observed vs. fitted values with an OLS trend and R-squared label
model_performance <- function(m) {
  require(ggplot2)
  
  # obs. vs fitted
  df <- data.frame(fitted=fitted(m), response=model.response(model.frame(m)))
  ggplot(df) + aes(fitted, response) +
    geom_point(alpha=0.3) + geom_abline() + geom_smooth(method="lm") +
    ggpmisc::stat_poly_eq(formula=y~x, parse=TRUE)
}

1 Data preparation

1.1 Cleaning and filtering

library(dplyr)

articles <- readr::read_csv(get_file("categories.csv")) %>%
  left_join(readr::read_csv(get_file("articles.csv")), by="id") %>%
  rename(subcategory = subcategory_name) %>%
  select(id, repository, category, subcategory, date) %>%
  distinct() %>%
  collect() %>%
  
  # define main categories for repositories other than arXiv
  mutate_cond(subcategory %in% c("Epidemiology", "Clinical Trials"), repository = "medrxiv") %>%
  mutate_cond(repository == "medrxiv", category = "Health Sciences") %>%
  mutate_cond(repository == "biorxiv", category = "Biology") %>%
  mutate_cond(repository == "psyarxiv", category = "Psychology") %>%
  mutate_cond(repository == "socarxiv", category = "Social Sciences")

articles %>%
  group_by(repository, category) %>%
  summarise(subcategories=length(unique(subcategory)), n=n())
articles <- articles %>%
  # mark lockdown period & move time reference to 0
  mutate(year = lubridate::year(date), month = lubridate::month(date)) %>%
  mutate(lockdown = year == 2020 & month > 2) %>%
  mutate(yearmon = (year - min(year)) + (month - 1) / 12) %>%
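  # e.g. if min(year) is 2017, March 2020 maps to (2020-2017) + (3-1)/12 ≈ 3.17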

  # adjustments to Biology and Economics
  mutate_cond(category == "Quantitative Biology", category = "Biology") %>%
  mutate_cond(category == "Quantitative Finance", category = "Economics") %>%
  
  # adjustments to socarxiv
  filter(subcategory != "Social and Behavioral Sciences") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Science and Technology Studies",
    "Environmental Studies",
    "International and Area Studies",
    "Leisure Studies",
    "Organization Development",
    "Leadership Studies"
    ), subcategory = "Other Social and Behavioral Sciences") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Library and Information Science",
    "Linguistics"
    ), subcategory = "Arts and Humanities") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Urban Studies and Planning",
    "Social Statistics"
    ), subcategory = "Sociology") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Social Work",
    "Legal Studies"
    ), subcategory = "Law") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Public Affairs, Public Policy and Public Administration"
    ), subcategory = "Political Science") %>%
  mutate_cond(repository == "socarxiv" & subcategory %in% c(
    "Agricultural and Resource Economics"
    ), subcategory = "Economics") %>%
  mutate_cond(id %in% filter(
    ., repository == "socarxiv" & subcategory == "Psychology")$id,
    category = "Psychology") %>%
  filter(subcategory != "Psychology") %>%
  na.omit() %>%
  distinct() %>%
  
  # keep, for socarxiv & psyarxiv, only the main subcategories with at least 100 papers
  filter((!repository %in% c("socarxiv", "psyarxiv")) | subcategory %in% unlist(
    lapply(c("socarxiv", "psyarxiv"), get_main, .)))

articles %>%
  group_by(category) %>%
  summarise(subcategories=length(unique(subcategory)), n=n())
text <- readr::read_csv(get_file("text.csv")) %>%
  select(id, title) %>%
  collect() %>%
  mutate(covidpaper = grepl("covid-19", title, ignore.case=TRUE) |
           grepl("sars-cov-2", title, ignore.case=TRUE) |
           grepl("coronavirus", title, ignore.case=TRUE))
## Warning: One or more parsing issues, see `problems()` for details
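
The three grepl calls could equally be collapsed into one alternation pattern; a sketch of the equivalent mutate step:

mutate(covidpaper = grepl("covid-19|sars-cov-2|coronavirus", title, ignore.case=TRUE))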
authors <- readr::read_csv(get_file("authors.csv")) %>%
  distinct() %>%
  collect() %>%
  mutate(across(probability, as.numeric)) %>%
  mutate(across(alphabetical_ordered, as.logical)) %>%
  mutate_cond(is.na(alphabetical_ordered), alphabetical_ordered=TRUE)

1.2 Feature engineering

# attach article metadata and the COVID flag; drop subcategories first seen in 2020
merge_info <- function(.data) .data %>%
  left_join(articles, by="id") %>%
  left_join(text[, c("id", "covidpaper")], by="id") %>%
  distinct() %>%
  na.omit() %>%
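  # titles matched before 2020 are false positives; force covidpaper to FALSE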
  mutate_cond(year < 2020, covidpaper = FALSE) %>%
  group_by(subcategory) %>%
  filter(min(year) < 2020) %>%
  ungroup()

# features all
df.id <- authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  group_by(id) %>%
  summarise(n_male = sum(gender == "male", na.rm=TRUE),
            n_female = sum(gender == "female", na.rm=TRUE),
            n_na = sum(is.na(gender)),
            total = n_male + n_female, .groups="drop") %>%
  merge_info() %>%
  # remove papers where 25% or more of the authors have missing gender
  filter(n_na < 0.25 * (n_na + n_male + n_female)) %>%
  # keep only subcategory-month groups with more than 30 gendered authors
  group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
  filter(sum(total) > 30) %>%
  ungroup()
df.all <- df.id %>%
  group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
  summarise(p_male = sum(n_male) / sum(total),
            total = sum(total), .groups="drop")

summarise_solo <- function(.data) .data %>%
  group_by(repository, category, subcategory, covidpaper, lockdown, yearmon) %>%
  summarise(n_male = sum(gender == "male", na.rm=TRUE),
            n_female = sum(gender == "female", na.rm=TRUE),
            total = n_male + n_female,
            p_male = n_male / total, .groups="drop")

# features first
df.first <- authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(!alphabetical_ordered & rank == "first") %>%
  merge_info() %>%
  summarise_solo()

# features last
df.last <- authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(!alphabetical_ordered & rank == "last") %>%
  merge_info() %>%
  summarise_solo()

# features single
df.single <- authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(rank == "single") %>%
  merge_info() %>%
  summarise_solo()

2 Descriptive analysis

2.1 Pre-processing results

2.1.1 Number of unique papers

articles %>%
  select(id) %>%
  distinct() %>%
  count()

2.1.2 Number of subcategories per category

articles %>%
  select(category, subcategory) %>%
  distinct() %>%
  count(category)

2.1.3 Number of papers per category

articles %>%
  select(category, id) %>%
  distinct() %>%
  count(category)

2.2 Analysis of missing values

2.2.1 Missing authors

authors %>%
  select(-id) %>%
  distinct() %>%
  summarise(
    missing = sum(is.na(probability)) / n(),
    discarded = sum(!is.na(probability) & (probability<0.95 | count<10)) / n(),
    total = n()
  )

2.2.2 Missing authors per year

articles %>%
  select(id, year) %>%
  distinct() %>%
  right_join(authors) %>%
  group_by(year) %>%
  summarise(
    missing = sum(is.na(probability)) / n(),
    discarded = sum(!is.na(probability) & (probability<0.95 | count<10)) / n(),
    total = n()
  ) %>%
  na.omit()
## Joining, by = "id"

2.3 Preprints considered

2.3.1 All authors

pp.all <- articles %>%
  select(id, year) %>%
  distinct() %>%
  count(year)
authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  group_by(id) %>%
  summarise(n_male = sum(gender == "male", na.rm=TRUE),
            n_female = sum(gender == "female", na.rm=TRUE),
            n_na = sum(is.na(gender)),
            total = n_male + n_female, .groups="drop") %>%
  merge_info() %>%
  # remove papers where 25% or more of the authors have missing gender
  filter(n_na < 0.25 * (n_na + n_male + n_female)) %>%
  select(id, year) %>%
  distinct() %>%
  count(year) %>%
  mutate(prop = n / pp.all$n)

2.3.2 First authors

pp.first <- authors %>%
  filter(!alphabetical_ordered & rank == "first") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year)
authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(!alphabetical_ordered & rank == "first") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year) %>%
  mutate(prop = n / pp.first$n)

2.3.3 Last authors

pp.last <- authors %>%
  filter(!alphabetical_ordered & rank == "last") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year)
authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(!alphabetical_ordered & rank == "last") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year) %>%
  mutate(prop = n / pp.last$n)

2.3.4 Single authors

pp.single <- authors %>%
  filter(rank == "single") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year)
authors %>%
  filter(probability >= 0.95 & count >= 10) %>%
  filter(rank == "single") %>%
  merge_info() %>%
  select(id, year) %>%
  distinct() %>%
  count(year) %>%
  mutate(prop = n / pp.single$n)

2.4 Evolution over time

2.4.1 Number of papers per month

library(ggplot2)
theme_set(theme_bw())

df.plot.papers <- df.id %>%
  select(yearmon, category, id) %>%
  distinct() %>%
  count(yearmon, category)
df.plot.papers.all <- df.plot.papers %>%
  group_by(yearmon) %>%
  summarise(n = sum(n), .groups="drop") %>%
  mutate(category="(all)")

ggplot(df.plot.papers) +
  aes(yearmon+2017, n) + facet_wrap(~category, scales="free_y") +
  geom_col(data=df.plot.papers.all) +
  geom_smooth(data=df.plot.papers.all) +
  geom_col() + geom_smooth() +
  zoo::scale_x_yearmon(format="%Y") +
  labs(y="Number of papers per month", x=NULL)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

2.4.2 Number of authors per month

df.plot.authors <- df.id %>%
  select(yearmon, category, id, total) %>%
  distinct() %>%
  group_by(yearmon, category) %>%
  summarise(n = sum(total), .groups="drop")
df.plot.authors.all <- df.plot.authors %>%
  group_by(yearmon) %>%
  summarise(n = sum(n), .groups="drop") %>%
  mutate(category="(all)")

ggplot(df.plot.authors) +
  aes(yearmon+2017, n) + facet_wrap(~category, scales="free_y") +
  geom_col(data=df.plot.authors.all) +
  geom_smooth(data=df.plot.authors.all) +
  geom_col() + geom_smooth() +
  zoo::scale_x_yearmon(format="%Y") +
  labs(y="Number of authors per month", x=NULL)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

2.4.3 Proportion of males per month

df.plot.p_male <- df.id %>%
  select(yearmon, category, id, n_male, total) %>%
  distinct() %>%
  group_by(yearmon, category) %>%
  summarise(p_male = sum(n_male) / sum(total), .groups="drop")
df.plot.p_male.all <- df.id %>%
  group_by(yearmon) %>%
  summarise(p_male = sum(n_male) / sum(total), .groups="drop") %>%
  mutate(category="(all)")

ggplot(df.plot.p_male) +
  aes(yearmon+2017, p_male) + facet_wrap(~category) +
  geom_point(data=df.plot.p_male.all) +
  geom_smooth(method="gam", data=df.plot.p_male.all) +
  geom_point() + geom_smooth(method="gam") +
  zoo::scale_x_yearmon(format="%Y") +
  labs(y="Proportion of male authors", x=NULL)
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

3 Modelling

3.1 Simple model
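
Each observation is the monthly proportion of male authors in a subcategory, fitted as a binomial GLM weighted by its number of gendered authors; on the logit scale the model being fit is:

$$\operatorname{logit}(p_{\mathrm{male}}) = \beta_0 + \beta_1\,\mathrm{yearmon} + \beta_2\,\mathrm{lockdown} + \beta_3\,\mathrm{covidpaper}$$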

model.glm <- p_male ~ yearmon + lockdown + covidpaper
fit.glm <- glm(model.glm, df.all, family=binomial, weights=total)
summary(fit.glm)
## 
## Call:
## glm(formula = model.glm, family = binomial, data = df.all, weights = total)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -26.0616   -1.3754    0.9651    2.5727   16.1094  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.826402   0.006090 299.901   <2e-16 ***
## yearmon        -0.104284   0.003042 -34.283   <2e-16 ***
## lockdownTRUE    0.071570   0.007224   9.908   <2e-16 ***
## covidpaperTRUE -0.613698   0.019441 -31.568   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 64761  on 3367  degrees of freedom
## Residual deviance: 62173  on 3364  degrees of freedom
## AIC: 79001
## 
## Number of Fisher Scoring iterations: 4
model_performance(fit.glm)
## `geom_smooth()` using formula 'y ~ x'

performance::check_model(fit.glm)

3.2 Hierarchical model
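
The hierarchical model keeps the same fixed effects and adds nested random intercepts for category and for subcategory within category; with $i$ indexing categories and $j$ subcategories:

$$\operatorname{logit}(p_{\mathrm{male},ij}) = \beta_0 + u_i + u_{ij} + \beta_1\,\mathrm{yearmon} + \beta_2\,\mathrm{lockdown} + \beta_3\,\mathrm{covidpaper}, \qquad u_i \sim \mathcal{N}(0,\sigma_c^2),\ u_{ij} \sim \mathcal{N}(0,\sigma_s^2)$$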

library(lme4)
## Loading required package: Matrix
model <- p_male ~ yearmon + lockdown + covidpaper + (1 | category/subcategory)
fit.all <- glmer(model, df.all, family=binomial, weights=total)
summary(fit.all)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: p_male ~ yearmon + lockdown + covidpaper + (1 | category/subcategory)
##    Data: df.all
## Weights: total
## 
##      AIC      BIC   logLik deviance df.resid 
##  22667.8  22704.5 -11327.9  22655.8     3362 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.8130 -0.7773  0.0898  0.8462  6.2945 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  subcategory:category (Intercept) 0.0852   0.2919  
##  category             (Intercept) 0.3838   0.6196  
## Number of obs: 3368, groups:  subcategory:category, 192; category, 10
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.595286   0.199014   8.016 1.09e-15 ***
## yearmon        -0.049239   0.003202 -15.378  < 2e-16 ***
## lockdownTRUE    0.030571   0.007491   4.081 4.48e-05 ***
## covidpaperTRUE  0.076362   0.021791   3.504 0.000458 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yearmn lcTRUE
## yearmon     -0.030              
## lockdwnTRUE  0.011 -0.629       
## covdpprTRUE -0.005 -0.001 -0.143
model_performance(fit.all)
## `geom_smooth()` using formula 'y ~ x'

performance::check_model(fit.all)

df.re.category <- sjPlot::get_model_data(fit.all, "re")[[2]] %>%
  mutate(group = ifelse(conf.low < 1 & conf.high > 1, NA, group)) %>%
  mutate(term = forcats::fct_reorder(term, estimate))
ggplot(df.re.category) +
  aes(estimate, term, xmin=conf.low, xmax=conf.high, color=group) +
  geom_vline(xintercept=1, color="black", linetype="dashed") +
  geom_errorbarh(height=0, size=1) + geom_point(size=2) +
  scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
  scale_color_manual(
    breaks=c("pos", "neg"), values=c("blue", "red"),
    na.value=grDevices::adjustcolor("grey50", 0.5)) +
  labs(y=NULL, x="Odds ratio", title="Random effect: category") +
  theme(legend.position="none")

df.re.subcategory <- sjPlot::get_model_data(fit.all, "re")[[1]] %>%
  mutate(group = ifelse(conf.low < 1 & conf.high > 1, NA, group)) %>%
  mutate(facet = sapply(strsplit(as.character(term), ":"), tail, 1)) %>%
  mutate(term = forcats::fct_reorder(term, estimate))

ggplot(df.re.subcategory) +
  aes(estimate, term, xmin=conf.low, xmax=conf.high, color=group) +
  facet_grid("facet", scales="free_y", space="free_y") +
  geom_vline(xintercept=1, color="black", linetype="dashed") +
  geom_errorbarh(height=0, size=1) + geom_point(size=2) +
  scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
  scale_color_manual(
    breaks=c("pos", "neg"), values=c("blue", "red"),
    na.value=grDevices::adjustcolor("grey50", 0.5)) +
  labs(y=NULL, x="Odds ratio", title="Random effect: subcategory") +
  theme(legend.position="none")

3.3 All together

fit <- list(
  glm = fit.glm,
  all = fit.all,
  first = glmer(model, df.first, family=binomial, weights=total),
  last = glmer(model, df.last, family=binomial, weights=total),
  single = glmer(model, df.single, family=binomial, weights=total)
)

df.terms <- lapply(fit, sjPlot::get_model_data, "est") %>%
  bind_rows(.id="fit") %>%
  mutate(type = ifelse(fit=="glm", "GLM", "GLMM")) %>%
  mutate(fit = ifelse(fit=="glm", "all", fit)) %>%
  mutate(fit = forcats::fct_inorder(fit)) %>%
  mutate(type = ifelse(p.value >= 0.05, NA, type))

levels(df.terms$term) <- c("COVID paper", "lockdown", "year")
stargazer::stargazer(fit, type="html")
====================================================================================
                                  Dependent variable: p_male
                    ----------------------------------------------------------------
                      logistic            generalized linear mixed-effects
                        (1)          (2)          (3)          (4)          (5)
------------------------------------------------------------------------------------
yearmon              -0.104***    -0.049***    -0.044***    -0.059***     -0.029
                      (0.003)      (0.003)      (0.009)      (0.009)      (0.018)
lockdown              0.072***     0.031***     0.035*        0.008        0.136***
                      (0.007)      (0.007)      (0.020)      (0.021)      (0.052)
covidpaper           -0.614***     0.076***     0.399***     0.129**       0.715**
                      (0.019)      (0.022)      (0.064)      (0.065)      (0.298)
Constant              1.826***     1.595***     1.520***     1.819***      2.480***
                      (0.006)      (0.199)      (0.195)      (0.173)      (0.169)
------------------------------------------------------------------------------------
Observations          3,368        3,368        3,996        4,027        3,517
Log Likelihood       -39,496.310  -11,327.900  -7,182.127   -6,955.634   -3,788.911
Akaike Inf. Crit.     79,000.620   22,667.800   14,376.250   13,923.270    7,589.823
Bayesian Inf. Crit.                22,704.540   14,414.010   13,961.070    7,626.815
====================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
ggplot(df.terms) +
  aes(estimate, term, xmin=conf.low, xmax=conf.high, color=type) +
  facet_grid("fit", scales="free_y", space="free_y") +
  geom_vline(xintercept=1, color="black", linetype="dashed") +
  geom_errorbarh(height=0, size=1) + geom_point(size=2) +
  ggrepel::geom_text_repel(
    aes(label=p.label), nudge_y=0.3, nudge_x=0.01, segment.size=0, show.legend=FALSE) +
  scale_x_log10(breaks=c(0.2, 0.5, 1, 2, 5)) +
  scale_color_manual(
    breaks=c("GLM", "GLMM"), values=c("red", "black"),
    na.value=grDevices::adjustcolor("grey50", 0.5)) +
  labs(y=NULL, x="Odds ratio", color="Model") +
  theme(legend.position=c(.99, .99), legend.justification=c(1, 1)) +
  theme(legend.background=element_rect(fill="#ffffff88"))