Coverage of DOAJ journals in WoS and Scopus dataset

Author
Affiliation

Philippe Mongeon

Dalhousie University

Published

March 18, 2024

Data sources

Create database

Code
# 
# # ----------------------------
# # countries
# # ----------------------------
# 
# dbExecute(db,
#           "CREATE TABLE countries (
#             country_code CHAR(2),
#             economy TEXT,
#             region TEXT,
#             income_group TEXT,
#             PRIMARY KEY (country_code)
#             );")
# 
# # ----------------------------
# # doaj_countries_publishers
# # ----------------------------
# 
# dbExecute(db,
#           "CREATE TABLE doaj_countries_publishers (
#             id SMALLINT,
#             country TEXT,
#             country_code CHAR(2),
#             PRIMARY KEY (id),
#             FOREIGN KEY (country_code) REFERENCES _countries (country_code)
#             );
#           ")
# 
# # ----------------------------
# # doaj
# # ----------------------------
# 
# dbExecute(db,
#           "CREATE TABLE doaj_journals (
#             id TEXT,
#             title TEXT,
#             oa_start_year SMALLINT,
#             country_publisher_id SMALLINT,
#             oa_type TEXT,
#             PRIMARY KEY (id),
#             FOREIGN KEY (country_publisher_id) 
#               REFERENCES _doaj_countries_publishers (id)
#           );")
# 
# # ----------------------------
# # doaj_issns
# # ----------------------------
# 
# dbExecute(db,
#           "CREATE TABLE doaj_issns (
#             doaj_id TEXT,
#             issn TEXT,
#             type TEXT,
#           PRIMARY KEY (doaj_id, issn),
#           FOREIGN KEY (doaj_id) REFERENCES _doaj (id));")
# 
# # ----------------------------
# # doaj_languages
# # ----------------------------
# 
# dbExecute(db,
#           "CREATE TABLE doaj_languages (
#             doaj_id TEXT,
#             language TEXT,
#           PRIMARY KEY (doaj_id, language),
#           FOREIGN KEY (doaj_id) REFERENCES _doaj (id));")
# 
# # ----------------------------
# # journals_disciplines
# # ----------------------------
# 
# dbExecute(db, 
#   "CREATE TABLE journals_disciplines (
#     doaj_id text,
#     discipline varchar(15),
#   PRIMARY KEY (doaj_id, discipline),
#   FOREIGN KEY (doaj_id) REFERENCES doaj (id));")

Data processing

Country income level

Code
countries <- read_csv("data/country_classification.csv")

DOAJ

Import DOAJ journal list

Code
# Load DOAJ journal list
doaj_data <- read_csv("data/journalcsv__doaj_20240306_1820_utf8.csv") %>% 
  select(id = `URL in DOAJ`,
         title = `Journal title`,
         oa_start_year = `When did the journal start to publish all content using an open license?`,
         alternative_title = `Alternative title`,
         issn = `Journal ISSN (print version)`,
         eissn = `Journal EISSN (online version)`,
         language = `Languages in which the journal accepts manuscripts`,
         country_publisher = `Country of publisher`,
         former_issn = `Continues`,
         new_issn = `Continued By`,
         apc = APC) %>% 
  mutate(id = str_remove(id, "https://doaj.org/toc/"))

Publishing countries

Code
doaj_countries_publishers <- doaj_data %>% 
  left_join(countries, by=c("country_publisher"="economy")) %>% 
  select(country_publisher, country_code) %>% 
  unique() %>% 
  rownames_to_column("id")

DOAJ journals

Code
doaj_journals <- doaj_data %>%    
  mutate(id = str_remove(id, "https://doaj.org/toc/")) %>%    
  mutate(oa_type = ifelse(apc == "Yes","Gold","Diamond")) %>% 
  left_join(doaj_countries_publishers %>% 
              rename(country_publisher_id = id),
            by="country_publisher") %>% 
  select(id, title, oa_start_year, country_publisher_id, oa_type)

Language

Code
doaj_languages <- doaj_data %>% 
  select(doaj_id = id, language) %>% 
  separate_rows(language, sep = ",") %>% 
  mutate(language = str_squish(language)) %>% 
  unique()

ISSNs

Code
   doaj_issns <- bind_rows(doaj_data %>%
                         select(doaj_id = id, issn) %>%
                         mutate(type = "issn"),
                       doaj_data %>%
                         select(doaj_id = id, issn = eissn) %>% 
                         mutate(type = "eissn"),
                       doaj_data %>%
                         select(doaj_id = id, issn = former_issn) %>% 
                         mutate(type = "former issn"),
                       doaj_data %>%
                         select(doaj_id = id, issn = new_issn) %>% 
                         mutate(type = "new issn")) %>% 
     unique() %>% 
     mutate(doaj_id = str_remove(doaj_id, "https://doaj.org/toc/"))

Journal titles

Code
doaj_titles <- doaj_data %>% 
  select(doaj_id = id, title) %>% 
  mutate(type = "main_title") %>% 
    bind_rows(doaj_data %>% 
                select(doaj_id = id, title = alternative_title) %>% 
                mutate(type = "alternative_title")) %>% 
  drop_na() %>% 
  unique()

Scopus

We do not exclude journals that were discontinued from Scopus in January 2024, and do not include the newly accepted journals in the process of being added to Scopus.

Import Scopus journal list

Code
scopus_data <- read_xlsx("data/ext_list_February_2024.xlsx", sheet = 1) %>% 
  select(id = 1,
         title = 2,
         issn = 3,
         eissn = 4,
         active = `Active or Inactive`,
         discontinued_quality_issues = 7,
         language = 8,
         oa_status = 10,
         type = 13,
         related_title1 = 15,
         related_title2 = 16,
         related_title3 = 17,
         related_title4 = 18,
         publisher = 19,
         life_sciences = 22,
         social_sciences = 23,
         physical_sciences = 24,
         health_sciences = 25
         ) %>% 
  mutate(life_sciences = as.logical(ifelse(is.na(life_sciences),F,T))) %>% 
  mutate(social_sciences = as.logical(ifelse(is.na(social_sciences),F,T))) %>% 
  mutate(physical_sciences = as.logical(ifelse(is.na(physical_sciences),F,T))) %>% 
  mutate(health_sciences = as.logical(ifelse(is.na(health_sciences),F,T))) %>% 
  mutate(active = as.logical(ifelse(active == "Active",T,F))) %>% 
  mutate(active = as.logical(ifelse(discontinued_quality_issues == "Discontinued by Scopus",T,F)))

Journals

Code
scopus_journals <- scopus_data %>% 
  select(id, title, active, discontinued_quality_issues, type, publisher,
         life_sciences, social_sciences, physical_sciences,
         health_sciences)

ISSNs

Code
scopus_issns <- bind_rows(scopus_data %>%
                            select(scopus_id = id, issn) %>%
                            mutate(type = "issn") %>%
                            drop_na() %>% 
                            mutate(issn = ifelse(str_length(issn)==7,
                                                 str_c("0",issn),
                                                 issn)) %>% 
                            mutate(issn = str_c(str_sub(issn,1,4),
                                                "-",
                                                str_sub(issn,5,8))),
                          scopus_data %>%
                            select(scopus_id = id, issn = eissn) %>% 
                            mutate(type = "eissn") %>% 
                            drop_na() %>% 
                            mutate(issn = ifelse(str_length(issn) ==7,
                                                 str_c("0",issn),
                                                 issn)) %>% 
                            mutate(issn = str_c(str_sub(issn,1,4),
                                                "-",
                                                str_sub(issn,5,8)))) %>%
  unique()

Journal titles

Code
scopus_titles <- bind_rows(scopus_data %>% 
                             select(scopus_id = id, title) %>% 
                             mutate(type = "title"),
                           scopus_data %>% 
                             select(scopus_id = id, title = related_title1) %>% 
                             mutate(type = "related_title"),
                           scopus_data %>% 
                             select(scopus_id = id, title = related_title2) %>% 
                             mutate(type = "related_title"),
                           scopus_data %>% 
                             select(scopus_id = id, title = related_title3) %>% 
                             mutate(type = "related_title"),
                           scopus_data %>% 
                             select(scopus_id = id, title = related_title4) %>% 
                             mutate(type = "related_title")) %>% 
  drop_na() %>% 
  unique() 

# Manually clean journal titles with square brackets
scopus_titles %>% 
  filter(str_detect(title, "\\[")) %>% 
  writexl::write_xlsx("data/scopus_journal_titles_to_clean.xlsx")

scopus_titles <- bind_rows(scopus_titles %>% 
  filter(!str_detect(title, "\\[")),
  read_xlsx("data/scopus_journal_titles_cleaned.xlsx")) %>% 
  unique()

Web of Science

Import WOS journal list

Code
wos_data <- bind_rows(read_csv("data/wos-core_AHCI 2024-February-19.csv") %>%
                        mutate(index = "ahci"),
                      esci<-read_csv("data/wos-core_ESCI 2024-February-19.csv") %>%
                        mutate(index = "esci"),
                      ssci<-read_csv("data/wos-core_SSCI 2024-February-19.csv") %>%
                        mutate(index = "ssci"),
                      scie<-read_csv("data/wos-core_SCIE 2024-February-19.csv") %>%
                        mutate(index = "scie")) %>% 
   select(title = `Journal title`,
         issn = ISSN,
         eissn = eISSN,
         publisher =`Publisher name`,
         language = Languages,
         wos_category = `Web of Science Categories`,
         index) %>% 
  group_by(title, issn, eissn, publisher, language, wos_category) %>% 
  mutate(index, str_c(index, collapse = ",")) %>% 
  unique() %>% 
  rownames_to_column("id")

Journals

Code
wos_journals <- wos_data %>% 
  select(-index)

Indexes

Code
wos_journals_indexes <- wos_data %>%
  select(wos_id = id, index) %>% 
  separate_rows(index, sep=",")

ISSNs

Code
wos_journals_issns <- bind_rows(wos_data %>% 
                                  select(wos_id = id, issn) %>%
                                  mutate(type = "issn") %>%
                                  drop_na(),
                                wos_data %>%
                                  select(wos_id = id, issn = eissn) %>% 
                                  mutate(type = "eissn") %>%
                                  drop_na()) %>% 
  unique()

Openalex

Import OpenAlex journal data

Code
openalex_data <- read_csv("data/openalex_Feb_2024.csv")

OpenAlex journals

Code
openalex_journals <- openalex_data %>% 
  select(id,
         title = display_name,
         is_oa,
         is_in_doaj,
         type,
         country_code,
         publisher)

ISSNs

Code
openalex_issns <- bind_rows(openalex_data %>% 
                              select(openalex_id = id, issn = issn_l) %>% 
                              mutate(type = "issn_l") %>% 
                              drop_na(),
                            openalex_data %>% 
                              select(openalex_id = id, issn) %>% 
                              separate_rows(issn, sep = ",") %>%
                              mutate(type = "issn") %>% 
                              drop_na() %>% 
                              mutate(issn = str_remove_all(issn, "\"")) %>% 
                              mutate(issn = str_remove_all(issn, "\\[")) %>% 
                              mutate(issn = str_remove_all(issn, "\\]"))) %>% 
  unique()

Journal matching

DOAJ and WoS

Code
doaj_wos_issn_match <- doaj_issns %>% 
  inner_join(wos_journals_issns, by="issn") %>% 
  select(doaj_id, wos_id) %>% 
  mutate(matched_by = "issn")

doaj_wos_title_match <- doaj_titles %>% 
  filter(!doaj_id %in% doaj_wos_issn_match$doaj_id) %>%
  mutate(title = str_to_lower(title)) %>% 
  inner_join(wos_journals %>% 
               select(wos_id = id, title) %>% 
               mutate(title = str_to_lower(title)), by="title") %>% 
  select(doaj_id, wos_id) %>% 
  mutate(matched_by = "title")

doaj_wos_matches <- bind_rows(doaj_wos_issn_match, 
                              doaj_wos_title_match) %>% 
  unique()

DOAJ and Scopus

Code
doaj_scopus_issn_match <- doaj_issns %>% 
  inner_join(scopus_issns, by="issn") %>% 
  select(doaj_id, scopus_id) %>% 
  mutate(matched_by = "issn")

doaj_scopus_title_match <- doaj_titles %>% 
  filter(!doaj_id %in% doaj_scopus_issn_match$doaj_id) %>%
  select(doaj_id, title) %>% 
  mutate(title = str_to_lower(title)) %>% 
  inner_join(scopus_journals %>% 
               select(scopus_id = id, title) %>% 
               mutate(title = str_to_lower(title)), by="title") %>% 
  select(doaj_id, scopus_id) %>% 
  mutate(matched_by = "title")

doaj_scopus_matches <- bind_rows(doaj_scopus_issn_match, 
                              doaj_scopus_title_match) %>% 
  unique()

DOAJ and OpenAlex

Code
doaj_openalex_issn_match <- doaj_issns %>% 
  inner_join(openalex_issns, by="issn") %>% 
  select(doaj_id, openalex_id) %>% 
  mutate(matched_by = "issn")

doaj_openalex_title_match <- doaj_titles %>% 
  filter(!doaj_id %in% doaj_openalex_issn_match$doaj_id) %>%
  mutate(title = str_to_lower(title)) %>% 
  inner_join(openalex_journals %>% 
               select(openalex_id = id, title) %>% 
               mutate(title = str_to_lower(title)), by="title") %>% 
  select(doaj_id, openalex_id) %>% 
  mutate(matched_by = "title")

doaj_openalex_matches <- bind_rows(doaj_openalex_issn_match, 
                              doaj_openalex_title_match) %>% 
  unique()

Checks and manual matches

  1. Journals listed as in_doaj = T in Openalex but that were not matched in our process
  2. Journals listed as OA journals in the Scopus list that were not matched in our process.

OpenAlex

Code
library(kableExtra)
openalex_in_doaj <- openalex_data %>%
  filter(is_in_doaj == T)

openalex_data %>% 
  left_join(doaj_openalex_matches , by=c("id"="openalex_id")) %>% 
  mutate(matched = ifelse(is.na(doaj_id),0,1)) %>% 
  group_by(is_in_doaj) %>%
  mutate(total = n()) %>% 
  mutate(n_matched = sum(matched)) %>% 
  select(is_in_doaj, n_matched, total) %>% 
  unique() %>% 
  kbl()
is_in_doaj n_matched total
FALSE 1334 273680
TRUE 19476 19691
Code
openalex_data %>%
  filter(is_in_doaj == T) %>% 
  anti_join(doaj_openalex_matches, by = c("id"="openalex_id")) %>% 
  writexl::write_xlsx("data/openalex_journals_to_match.xslx")

dbReadTable(db, "doaj") %>% 
  left_join(dbReadTable(db, "doaj_issns"), by=c("id" = "doaj_id")) %>%
  group_by(id) %>% 
  mutate(issn = str_c(issn, collapse = ", ")) %>% 
  unique() %>% 
  ungroup() %>% 
  writexl::write_xlsx("data/doaj_for_matching.xlsx")

Add manual matches to data

Code
openalex_manual_matches <- read_xlsx("data/openalex_journals_manually_matched.xlsx")

doaj_openalex_matches <- bind_rows(doaj_openalex_matches,
                                   openalex_manual_matches %>% 
                                     filter(doaj_id != 0) %>%  
                                     select(doaj_id, openalex_id = id) %>% 
                                     mutate(matched_by = "manual"))

doaj_openalex_matches <- doaj_openalex_matches %>% 
  mutate(openalex_id = str_remove(openalex_id, "https://openalex.org/S"))

Scopus

Code
scopus_data %>% 
  left_join(doaj_scopus_matches, by = c("id" = "scopus_id")) %>% 
  mutate(matched = ifelse(!is.na(doaj_id),1,0)) %>% 
  group_by(oa_status) %>% 
  mutate(total = n()) %>% 
  mutate(n_matched = sum(matched)) %>% 
  select(oa_status, n_matched, total) %>% 
  unique() %>% 
  kbl() 
oa_status n_matched total
NA 972 38074
Unpaywall Open Acess 7230 7405
Code
scopus_data %>%
  filter(!is.na(oa_status)) %>% 
  anti_join(doaj_scopus_matches, by = c("id"="scopus_id")) %>% 
  writexl::write_xlsx("data/scopus_journals_to_match.xslx")

Add manual matches to data

Code
scopus_manual_matches <- read_xlsx("data/scopus_journals_manually_matched.xlsx")

doaj_scopus_matches <- bind_rows(doaj_scopus_matches,
                                   scopus_manual_matches %>% 
                                     filter(doaj_id != 0) %>%  
                                     select(doaj_id, scopus_id = id) %>% 
                                     mutate(matched_by = "manual"))

Discipline classification

Code
# doaj_issns <- dbReadTable(db, "doaj_issns")
# old_data <- dbReadTable(db, "data")
# 
# dbWriteTable(db, "journals_disciplines", 
#              doaj_issns %>%
#                filter(!doaj_id %in% dbReadTable(db,"journals_disciplines")$doaj_id) %>%
#                inner_join(old_data %>% filter(!is.na(issn)) %>% select(issn, nsf), by="issn") %>%
#                select(doaj_id, discipline = nsf) %>% 
#                separate_rows(discipline, sep = ";") %>% 
#                mutate(discipline = str_squish(discipline)) %>% 
#                unique() %>% 
#                drop_na(),
#              row.names = F,
#              append = T)
# 
# dbWriteTable(db, "journals_disciplines", 
#              doaj_issns %>%
#                filter(!doaj_id %in% dbReadTable(db,"journals_disciplines")$doaj_id) %>%
#                inner_join(old_data %>% filter(!is.na(eissn)) %>% select(issn = eissn, nsf), by="issn") %>%
#                select(doaj_id, discipline = nsf) %>% 
#                separate_rows(discipline, sep = ";") %>% 
#                mutate(discipline = str_squish(discipline)) %>% 
#                unique() %>% 
#                drop_na(),
#              row.names = F,
#              append = T)
# 
# dbWriteTable(db, "journals_disciplines", 
#              dbReadTable(db, "doaj_titles") %>%
#                filter(!doaj_id %in% dbReadTable(db,"journals_disciplines")$doaj_id) %>%
#                inner_join(old_data %>% filter(!is.na(journal_title)) %>% select(title = journal_title, nsf), by="title") %>%
#                select(doaj_id, discipline = nsf) %>% 
#                separate_rows(discipline, sep = ";") %>% 
#                mutate(discipline = str_squish(discipline)) %>% 
#                unique() %>% 
#                drop_na(),
#              row.names = F,
#              append = T)

Automatic classification based on previous classification

Code
# library(tidytext)
# library(textstem)
# subject_discipline <- dbReadTable(db, "doaj") %>% 
#   separate_rows(subject, sep = ",") %>% 
#   mutate(subject = str_squish(subject)) %>% 
#   mutate(subject = str_sub(subject,1,str_locate(subject,":")[,1]-1)) %>% 
#   select(id, subject) %>% 
#   inner_join(dbReadTable(db, "journals_disciplines"), by=c("id" = "doaj_id")) %>% 
#   group_by(subject, discipline) %>% 
#   summarize(n = n()) %>% 
#   ungroup() %>% 
#   group_by(subject) %>% 
#   mutate(total = sum(n)) %>% 
#   mutate(pct = n/total) %>% 
#   drop_na()
#   
# journal_discipline <- dbReadTable(db, "doaj") %>% 
#   filter(!id %in% dbReadTable(db, "journals_disciplines")$doaj_id) %>% 
#   separate_rows(subject, sep = ",") %>% 
#   mutate(subject = str_squish(subject)) %>% 
#   mutate(subject = str_sub(subject,1,str_locate(subject,":")[,1]-1)) %>% 
#   select(id, title, subject) %>%
#   inner_join(subject_discipline, by="subject") %>% 
#   group_by(id, discipline) %>% 
#   mutate(total_score = sum(pct)) %>% 
#   ungroup() %>% 
#   group_by(id) %>% 
#   mutate(best_match = ifelse(total_score == max(total_score),T,F)) %>%
#   filter(best_match == T) %>% 
#   select(doaj_id = id, discipline) %>% 
#   unique()

Create file with journals to manually classify

Code
# dbReadTable(db, "doaj") %>% 
#   filter(!id %in% dbReadTable(db, "journals_disciplines")$doaj_id) %>% 
#   writexl::write_xlsx("data/journals_to_classify.xlsx")

update data

Code
 # 
 # journals_disciplines <- dbReadTable(db, "journals_disciplines") %>% 
 #   bind_rows(read_xlsx("data/journals_to_classify.xlsx") %>% 
 #               select(doaj_id = id, discipline) %>% 
 #               unique())

Authorship data

Getting the authorship data from Google Big Query

Code
-- Query executed on March 8th 2024
/*
SELECT a.source_id,
       c.publication_year,
       d.institution_id,
       count(author_id) as n_publications
FROM `global-timer-415013.diamon_oa.openalex_sources`as a
JOIN insyspo.publicdb_openalex_2024_02_rm.works_primary_location as b
  on b.source_id = a.source_id
JOIN insyspo.publicdb_openalex_2024_02_rm.works as c
  on c.id = b.id
JOIN insyspo.publicdb_openalex_2024_02_rm.works_authorships as d
  on d.work_id = c.id
WHERE c.publication_year between 2019 and 2023 and c.type='article'
GROUP BY a.source_id, c.publication_year, d.institution_id"
*/

Import OpenAlex authorship data

Code
openalex_authorship <- read_csv("data/openalex_institutions_work_count_per_year.csv") %>% 
  mutate(source_id = str_remove(source_id, "https://openalex.org/S")) %>% 
  mutate(institution_id = str_remove(institution_id, "https://openalex.org/I"))

Import OpenAlex institution data

Code
openalex_institutions <- read_csv("data/openalex_institutions.csv") %>% 
  select(id, name = display_name, country_code, country) %>% 
  mutate(id = str_remove(id, "https://openalex.org/I"))

Analysis

Code
journals_disciplines <- read_csv("data/journals_disciplines.csv") %>% 
  bind_rows(read_xlsx("data/journals_to_classify.xlsx") %>% 
               select(doaj_id = id, discipline) %>% 
               unique())

doaj_coverage <- doaj_journals %>% 
  select(id) %>% 
  unique() %>% 
  mutate(coverage = case_when(
    id %in% doaj_scopus_matches$doaj_id ~ "WoS or Scopus",
    id %in% filter(doaj_wos_matches %>% 
                     inner_join(wos_journals_indexes, by="wos_id"),
                   index == "esci")$doaj_id ~ "WoS - ESCI",
    id %in% filter(doaj_wos_matches %>% 
                     inner_join(wos_journals_indexes, by="wos_id"),
                   index != "esci")$doaj_id ~ "WoS or Scopus",
    .default = "Not in WoS or Scopus"))

Coverage of DOAJ journals in OpenAlex, WoS and, Scopus

Overlap

Code
library("ggVennDiagram")
library("ggvenn")


plot_overlap_diamond <- list(
  OpenAlex = unique(doaj_journals %>% 
                      filter(oa_type == "Diamond") %>% 
                      inner_join(doaj_openalex_matches, 
                                 by=c("id" = "doaj_id")))$id,
  WoS = unique(doaj_journals %>% 
                      filter(oa_type == "Diamond") %>% 
                      inner_join(doaj_wos_matches, 
                                 by=c("id" = "doaj_id")))$id,
  Scopus = unique(doaj_journals %>% 
                      filter(oa_type == "Diamond") %>% 
                      inner_join(doaj_scopus_matches, 
                                 by=c("id" = "doaj_id")))$id
  ) %>% 
  ggvenn(stroke_size = 0.5) +
  ggtitle("Diamond OA") +
  theme(plot.title = element_text(size = 20, hjust = .5))


plot_overlap_gold <- list(
  OpenAlex = unique(doaj_journals %>% 
                      filter(oa_type == "Gold") %>% 
                      inner_join(doaj_openalex_matches, 
                                 by=c("id" = "doaj_id")))$id,
  WoS = unique(doaj_journals %>% 
                      filter(oa_type == "Gold") %>% 
                      inner_join(doaj_wos_matches, 
                                 by=c("id" = "doaj_id")))$id,
  Scopus = unique(doaj_journals %>% 
                      filter(oa_type == "Gold") %>% 
                      inner_join(doaj_scopus_matches, 
                                 by=c("id" = "doaj_id")))$id
  ) %>% 
  ggvenn(stroke_size = 0.5) +
  ggtitle("Gold OA") +
  theme(plot.title = element_text(size = 20, hjust = .5))

ggarrange(plot_overlap_diamond, plot_overlap_gold)

Code
ggsave("tables and figures/overlap.tiff", device = "tiff", width = 8, height = 6, units = "in", dpi=300)

Coverage

Code
plot_coverage <- doaj_journals %>%
  inner_join(doaj_coverage, by="id") %>% 
  inner_join(journals_disciplines, by=c("id"="doaj_id")) %>% 
  filter(discipline != 'n/a') %>% 
  mutate(group = str_c(oa_type, " - ", discipline)) %>% 
  unique() %>% 
  group_by(group) %>% 
  mutate(total = n()) %>% 
  group_by(group, coverage) %>% 
  mutate(n = n()) %>% 
  select(group, coverage, n, total) %>% 
  unique() %>% 
  mutate(p = n/total) %>% 
  ggplot() +
  aes(y=p, x=group, fill=coverage) +
  geom_bar(stat="identity") +
  ylab("Share of journals") +
  xlab("Domain and OA type") +
  theme_minimal() +
  theme(plot.title = element_text(size = 10, hjust = .5)) +
  theme(legend.title=element_blank()) +
  theme(legend.position = "top") + 
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Dark2")

plot_coverage

Code
ggsave("tables and figures/coverage.tiff", device = "tiff", width = 8, height = 5, units = "in", dpi=300)

Publishers

Publisher region

Code
plot_publisher_region <- doaj_journals %>% 
  select(id, country_publisher_id, oa_type) %>% 
  unique() %>%
  inner_join(doaj_coverage, by="id") %>% 
  inner_join(doaj_countries_publishers, by=c("country_publisher_id"="id")) %>% 
  inner_join(countries, by="country_code") %>% 
  inner_join(journals_disciplines, by=c("id" = "doaj_id")) %>% 
  filter(discipline != "n/a") %>% 
  select(id, coverage, region, discipline, oa_type) %>%
  mutate(group = str_c(oa_type, " - ", discipline)) %>% 
  group_by(group) %>% 
  mutate(total = n()) %>%
  drop_na() %>% 
  group_by(group, region, coverage) %>% 
  reframe(n_journals = n(),
            p_journals = n_journals/total) %>% 
  unique() %>% 
  group_by(group, region) %>% 
  mutate(x=sum(p_journals)) %>% 
  ungroup() %>% 
  ggplot() +
  aes(x=p_journals, y=reorder(region, x), fill=coverage) +
  scale_x_continuous(labels = scales::percent, breaks=c(0,0.2,0.4,0.6,0.8,1)) +
  geom_bar(stat="identity") +
  ylab("Region") +
  xlab("Share of journals") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank()) +
  theme(plot.title = element_text(size = 10, hjust = .5)) +
  theme(legend.title=element_blank()) +
  theme(legend.position = "top") +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(facets = "group")


plot_publisher_region$data %>% 
  writexl::write_xlsx("tables and figures/publisher_region_data.xlsx")

plot_publisher_region

Code
ggsave("tables and figures/publisher_region.tiff", device = "tiff", width = 8, height = 5, units = "in", dpi=300)

Publisher income group

Code
plot_publisher_income_group <- doaj_journals %>% 
  select(id, country_publisher_id, oa_type) %>% 
  unique() %>%
  inner_join(doaj_coverage, by="id") %>% 
  inner_join(doaj_countries_publishers, by=c("country_publisher_id"="id")) %>% 
  inner_join(countries, by="country_code") %>% 
  inner_join(journals_disciplines, by=c("id" = "doaj_id")) %>% 
  filter(discipline != "n/a") %>% 
  select(id, coverage, income_group, discipline, oa_type) %>%
  mutate(group = str_c(oa_type, " - ", discipline)) %>% 
  group_by(group) %>% 
  mutate(total = n()) %>%
  drop_na() %>% 
  group_by(group, income_group, coverage) %>% 
  reframe(n_journals = n(),
            p_journals = n_journals/total) %>% 
  unique() %>% 
  group_by(group, income_group) %>% 
  mutate(x=sum(p_journals)) %>% 
  ungroup() %>%
  mutate(income_group = factor(income_group, 
                               levels=c("Low income", 
                                        "Lower middle income", 
                                        "Upper middle income",
                                        "High income"))) %>%

  ggplot() +
  aes(x=p_journals, y=income_group, fill=coverage) +
  scale_x_continuous(labels = scales::percent) +
  geom_bar(stat="identity") +
  ylab("Income group") +
  xlab("Share of journals") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank()) +
  theme(plot.title = element_text(size = 10, hjust = .5)) +
  theme(legend.title=element_blank()) +
  theme(legend.position = "top") +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(facets = "group")

plot_publisher_income_group

Code
plot_publisher_income_group$data %>% 
  writexl::write_xlsx("tables and figures/publisher_income_group_data.xlsx")

ggsave("tables and figures/publisher_income_group.tiff", device = "tiff", width = 8, height = 5, units = "in", dpi=300)

Publishing language

Here we divide journals in three groups:

  1. English journals only publish in English
  2. Multiple languages journals publish in English and at least one more language.
  3. Non-English journals do not publish in English.
Code
n_lang <- doaj_languages %>% 
  group_by(doaj_id) %>%
  summarize(n_lang = n())

journal_language<-doaj_languages %>% 
  filter(language == "English") %>% 
  filter(doaj_id %in% filter(n_lang, n_lang == 1)$doaj_id) %>% 
  select(doaj_id) %>% 
  mutate(language = "English") %>% 
  unique()

# Classify journals in one of the three language groups
journal_language<-bind_rows(journal_language,
                            doaj_languages %>% 
                              filter(language == "English") %>% 
                              filter(doaj_id %in% filter(n_lang, n_lang > 1)$doaj_id) %>% 
                            select(doaj_id) %>% 
                            mutate(language = "Multiple Languages") %>% 
                            unique())

journal_language<-bind_rows(journal_language,
                          doaj_languages %>% 
                            filter(!doaj_id %in% journal_language$doaj_id) %>% 
                            select(doaj_id) %>% 
                            mutate(language = "Non-English") %>% 
                            unique())
                            

plot_publishing_language <- doaj_journals %>% 
  select(id, oa_type) %>% 
  unique() %>% 
  inner_join(doaj_coverage, by="id") %>% 
  inner_join(journal_language, by=c("id"="doaj_id")) %>% 
  inner_join(journals_disciplines, by=c("id"="doaj_id")) %>% 
  filter(discipline != "n/a") %>% 
  select(id, coverage, language, discipline, oa_type) %>%
  mutate(group = str_c(oa_type, " - ", discipline)) %>% 
  unique() %>% 
  group_by(group) %>% 
  mutate(total = n()) %>%
  ungroup() %>% 
  drop_na() %>% 
  group_by(group, language, coverage) %>% 
  reframe(n_journals = n(),
            p_journals = n_journals/total) %>% 
  ungroup() %>% 
  unique() %>% 
  group_by(language) %>% 
  mutate(total_language=sum(n_journals)) %>% 
  arrange(desc(total_language)) %>% 
  ungroup() %>% 
  ggplot() +
  aes(x=p_journals, y=language, fill=coverage) +
  scale_x_continuous(labels = scales::percent) +
  geom_bar(stat="identity") +
  ylab("Language") +
  xlab("Share of journals") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank()) +
  theme(plot.title = element_text(size = 10, hjust = .5)) +
  theme(legend.title=element_blank()) +
  theme(legend.position = "top") +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(facets = "group")

plot_publishing_language

Code
plot_publishing_language$data %>% 
  writexl::write_xlsx("tables and figures/publisher_language_data.xlsx")

ggsave("tables and figures/publishing_language.tiff", device = "tiff", width = 8, height = 5, units = "in", dpi=300)

Publisher countries (Gini)

Code
table_publisher_countries_gini <- 
  data <- doaj_journals %>% 
  select(id, country_publisher_id, oa_type) %>% 
  unique() %>% 
  inner_join(doaj_coverage, by="id") %>% 
  inner_join(doaj_countries_publishers, by=c("country_publisher_id"="id")) %>% 
  inner_join(journals_disciplines, by=c("id"="doaj_id")) %>% 
  filter(discipline != "n/a")


write_xlsx(table_publisher_countries_gini %>% 
             group_by(discipline, oa_type, coverage, country_code) %>% 
             reframe(n_journals = n()),
           "tables and figures/data_journals_by_country.xlsx")


# Creates a combination of every journal and country code
x<-expand(countries,
          select(table_publisher_countries_gini, 
                 id, coverage, discipline, oa_type),
          country_code) %>% 
  unique()

table_publisher_countries_gini <-x %>% 
  left_join(table_publisher_countries_gini %>%
              select(id, country_code) %>% 
              mutate(n = 1), 
            by=c("id", "country_code")) %>% 
  mutate(n = ifelse(is.na(n),0,1)) %>% 
  unique() %>% 
  group_by(coverage, oa_type, discipline, country_code) %>% 
  reframe(n = sum(n)) %>% 
  group_by(coverage, oa_type, discipline) %>% 
  summarize(gini = ineq(n,type="Gini")) %>% 
  pivot_wider(names_from = "coverage", values_from = "gini") %>% 
  drop_na()

table_publisher_countries_gini %>% 
  kbl()
oa_type discipline Not in WoS or Scopus WoS - ESCI WoS or Scopus
Diamond BM 0.9081304 0.9382768 0.9181212
Diamond NSE 0.9039658 0.9169850 0.8948191
Diamond SSH 0.9086776 0.9378867 0.9055282
Gold BM 0.9461987 0.9705903 0.9615593
Gold NSE 0.9524853 0.9605911 0.9496604
Gold SSH 0.9591767 0.9583750 0.9440742

Authorship

Author region

Code
plot_authorship_region <- doaj_journals %>% 
  inner_join(doaj_openalex_matches, by=c("id"="doaj_id")) %>% 
  left_join(openalex_authorship, by=c("openalex_id"="source_id")) %>% 
  left_join(openalex_institutions, by=c("institution_id" = "id")) %>% 
  left_join(countries, by="country_code") %>% 
  inner_join(doaj_coverage, by="id") %>% 
  inner_join(journals_disciplines, by=c("id" = "doaj_id")) %>% 
  filter(discipline != "n/a") %>% 
  mutate(group = str_c(oa_type, " - ", discipline)) %>%
  group_by(group) %>% 
  drop_na() %>% 
  mutate(total = sum(n_publications)) %>%
  group_by(group, region, coverage) %>% 
  reframe(n_authorships = sum(n_publications),
            p_authorships = n_authorships/total) %>% 
  unique() %>% 
  group_by(group, region) %>% 
  mutate(x=sum(p_authorships)) %>% 
  ungroup() %>% 
  ggplot() +
  aes(x=p_authorships, y=reorder(region, x), fill=coverage) +
  scale_x_continuous(labels = scales::percent, breaks=c(0,0.2,0.4,0.6,0.8,1)) +
  geom_bar(stat="identity") +
  ylab("Region") +
  xlab("Share of authorships") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank()) +
  theme(plot.title = element_text(size = 10, hjust = .5)) +
  theme(legend.title=element_blank()) +
  theme(legend.position = "top") +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(facets = "group")

plot_authorship_region

Code
plot_publisher_region$data %>% 
  writexl::write_xlsx("tables and figures/authorships_region_data.xlsx")


ggsave("tables and figures/authorship_region.tiff", device = "tiff", width = 8, height = 5, units = "in", dpi=300)

Author income group

Code
plot_authorship_income_group <- doaj_journals %>% 
  inner_join(doaj_openalex_matches, by=c("id"="doaj_id")) %>% 
  left_join(openalex_authorship, by=c("openalex_id"="source_id")) %>% 
  left_join(openalex_institutions, by=c("institution_id" = "id")) %>% 
  left_join(countries, by="country_code") %>% 
  inner_join(doaj_coverage, by="id") %>% 
  inner_join(journals_disciplines, by=c("id" = "doaj_id")) %>% 
  filter(discipline != "n/a") %>% 
  mutate(group = str_c(oa_type, " - ", discipline)) %>%
  group_by(group) %>% 
  drop_na() %>% 
  mutate(total = sum(n_publications)) %>%
  group_by(group, income_group, coverage) %>% 
  reframe(n_authorships = sum(n_publications),
            p_authorships = n_authorships/total) %>% 
  unique() %>% 
  group_by(group, income_group) %>% 
  mutate(x=sum(p_authorships)) %>% 
  ungroup() %>% 
  ggplot() +
  aes(x=p_authorships, y=reorder(income_group, x), fill=coverage) +
  scale_x_continuous(labels = scales::percent, breaks=c(0,0.2,0.4,0.6,0.8,1)) +
  geom_bar(stat="identity") +
  ylab("Income group") +
  xlab("Share of authorships") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank()) +
  theme(plot.title = element_text(size = 10, hjust = .5)) +
  theme(legend.title=element_blank()) +
  theme(legend.position = "top") +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(facets = "group")

plot_authorship_income_group$data %>% 
  writexl::write_xlsx("tables and figures/authorships_income_group_data.xlsx")

plot_authorship_income_group

Code
ggsave("tables and figures/authorship_income_group.tiff", device = "tiff", width = 8, height = 5, units = "in", dpi=300)

Author countries (Gini)

Code
countries_in_data <- openalex_authorship %>% 
  inner_join(openalex_institutions, by=c("institution_id"="id")) %>% 
  select(country_code) %>% 
  drop_na() %>% 
  unique()

authorship_gini_plot <- doaj_journals %>% 
  inner_join(doaj_openalex_matches, by=c("id"="doaj_id")) %>% 
  left_join(openalex_authorship, by=c("openalex_id"="source_id")) %>% 
  left_join(openalex_institutions, by=c("institution_id" = "id")) %>% 
  left_join(journals_disciplines, by=c("id"="doaj_id")) %>% 
  left_join(doaj_coverage, by="id") %>% 
  select(id, discipline, oa_type, coverage, country_code, n_publications) %>% 
  group_by(id, discipline, oa_type, coverage, country_code) %>% 
  reframe(n = sum(n_publications))

write_xlsx(authorship_gini_plot %>% 
             group_by(discipline, oa_type, coverage, country_code) %>% 
             reframe(total_authorship = sum(n)),
           "tables and figures/data_authorships_by_country.xlsx")

x<-expand(countries,
          doaj_journals %>% select(id),
          country_code) %>% 
  inner_join(countries_in_data, by="country_code")

authorship_gini_plot <-x %>% 
  left_join(authorship_gini_plot, by=c("id","country_code")) %>% 
  mutate(n = ifelse(is.na(n),0,n)) %>% 
  group_by(id) %>% 
  mutate(gini = ineq(n,type="Gini")) %>% 
  drop_na() %>% 
  filter(discipline != "n/a") %>% 
  mutate(discipline = as.factor(discipline),
         coverage = as.factor(coverage),
         oa_type = as.factor(oa_type)) %>% 
  unique()

authorship_gini_plot %>%
  select(id, coverage, oa_type, discipline, gini) %>% 
  filter(discipline != 'n/a') %>% 
  unique() %>% 
  mutate(coverage = as.character(coverage)) %>% 
  mutate(coverage = ifelse(coverage == "Not in WoS or Scopus",
                           "OpenAlex only",
                           coverage)) %>% 
  ggplot() +
  aes(x=coverage, y = gini, fill = oa_type) +
  geom_boxplot() +
  facet_wrap(facets = "discipline") +
  ylab("Gini coefficient") +
  xlab("coverage") +
  theme_minimal() +
  theme(legend.position = "top") +
  scale_fill_brewer(palette = "Dark2") +
  theme(axis.text.x = element_text(angle = 45)) + 
  theme(axis.text.x = element_text(margin = margin(t = 20))) +
  theme(legend.title = element_blank()) +
  theme(axis.title.x = element_text(margin = margin(t = -15))) 

Code
ggsave("tables and figures/authorship_gini.tiff", device = "tiff", width = 8, height = 5, units = "in", dpi=300)

Kruskall-Wallis test

Code
authorship_gini_plot<-authorship_gini_plot %>%
  ungroup() %>% 
  mutate(group = str_c(oa_type, " - ", coverage)) %>% 
  mutate(group = as.factor(group))

authorship_gini_plot %>% 
  group_by(discipline) %>% 
  kruskal_test(gini ~ group) %>% 
  select(discipline, n, statistic, df, p) %>% 
  inner_join(authorship_gini_plot %>%
               group_by(discipline) %>% 
               kruskal_effsize(gini ~ group) %>% 
               select(discipline, effsize, magnitude),
             by="discipline") %>% 
  kbl() 
discipline n statistic df p effsize magnitude
BM 140356 34698.79 5 0 0.2471948 large
NSE 127369 27503.77 5 0 0.2159086 large
SSH 99600 35145.65 5 0 0.3528391 large

Pairwise Wilcoxon test

Code
authorship_gini_plot %>% 
  filter(discipline == "BM") %>% 
  wilcox_test(gini ~ group, p.adjust.method = "bonferroni") %>% 
  select(-1) %>% 
  kbl(caption = "Pairwise Wilcoxon test for BM journals")
Pairwise Wilcoxon test for BM journals
group1 group2 n1 n2 statistic p p.adj p.adj.signif
Diamond - Not in WoS or Scopus Diamond - WoS - ESCI 10551 1876 13696012 0 0 ****
Diamond - Not in WoS or Scopus Diamond - WoS or Scopus 10551 21620 165080846 0 0 ****
Diamond - Not in WoS or Scopus Gold - Not in WoS or Scopus 10551 8705 67648676 0 0 ****
Diamond - Not in WoS or Scopus Gold - WoS - ESCI 10551 3683 33416252 0 0 ****
Diamond - Not in WoS or Scopus Gold - WoS or Scopus 10551 93921 899097033 0 0 ****
Diamond - WoS - ESCI Diamond - WoS or Scopus 1876 21620 22568556 0 0 ****
Diamond - WoS - ESCI Gold - Not in WoS or Scopus 1876 8705 10267316 0 0 ****
Diamond - WoS - ESCI Gold - WoS - ESCI 1876 3683 5391716 0 0 ****
Diamond - WoS - ESCI Gold - WoS or Scopus 1876 93921 149528659 0 0 ****
Diamond - WoS or Scopus Gold - Not in WoS or Scopus 21620 8705 107084083 0 0 ****
Diamond - WoS or Scopus Gold - WoS - ESCI 21620 3683 56381925 0 0 ****
Diamond - WoS or Scopus Gold - WoS or Scopus 21620 93921 1588441118 0 0 ****
Gold - Not in WoS or Scopus Gold - WoS - ESCI 8705 3683 19549852 0 0 ****
Gold - Not in WoS or Scopus Gold - WoS or Scopus 8705 93921 562692469 0 0 ****
Gold - WoS - ESCI Gold - WoS or Scopus 3683 93921 206964867 0 0 ****
Code
authorship_gini_plot %>% 
  filter(discipline == "NSE") %>% 
  wilcox_test(gini ~ group, p.adjust.method = "bonferroni") %>% 
  select(-1) %>% 
  kbl(caption = "Pairwise Wilcoxon test for NSE journals")
Pairwise Wilcoxon test for NSE journals
group1 group2 n1 n2 statistic p p.adj p.adj.signif
Diamond - Not in WoS or Scopus Diamond - WoS - ESCI 11329 1623 10518550 0 0 ****
Diamond - Not in WoS or Scopus Diamond - WoS or Scopus 11329 25271 224415720 0 0 ****
Diamond - Not in WoS or Scopus Gold - Not in WoS or Scopus 11329 9673 73156915 0 0 ****
Diamond - Not in WoS or Scopus Gold - WoS - ESCI 11329 1565 14653520 0 0 ****
Diamond - Not in WoS or Scopus Gold - WoS or Scopus 11329 77908 779928144 0 0 ****
Diamond - WoS - ESCI Diamond - WoS or Scopus 1623 25271 32241307 0 0 ****
Diamond - WoS - ESCI Gold - Not in WoS or Scopus 1623 9673 10300465 0 0 ****
Diamond - WoS - ESCI Gold - WoS - ESCI 1623 1565 2128887 0 0 ****
Diamond - WoS - ESCI Gold - WoS or Scopus 1623 77908 114633754 0 0 ****
Diamond - WoS or Scopus Gold - Not in WoS or Scopus 25271 9673 104319919 0 0 ****
Diamond - WoS or Scopus Gold - WoS - ESCI 25271 1565 22707667 0 0 ****
Diamond - WoS or Scopus Gold - WoS or Scopus 25271 77908 1349522587 0 0 ****
Gold - Not in WoS or Scopus Gold - WoS - ESCI 9673 1565 9622416 0 0 ****
Gold - Not in WoS or Scopus Gold - WoS or Scopus 9673 77908 549418991 0 0 ****
Gold - WoS - ESCI Gold - WoS or Scopus 1565 77908 75215917 0 0 ****
Code
authorship_gini_plot %>% 
  filter(discipline == "SSH") %>% 
  wilcox_test(gini ~ group, p.adjust.method = "bonferroni") %>%
  select(-1) %>% 
  kbl(caption = "Pairwise Wilcoxon test for SSH journals")
Pairwise Wilcoxon test for SSH journals
group1 group2 n1 n2 statistic p p.adj p.adj.signif
Diamond - Not in WoS or Scopus Diamond - WoS - ESCI 36878 6617 153026523 0.000 0 ****
Diamond - Not in WoS or Scopus Diamond - WoS or Scopus 36878 29555 831561731 0.000 0 ****
Diamond - Not in WoS or Scopus Gold - Not in WoS or Scopus 36878 8390 156609211 0.078 1 ns
Diamond - Not in WoS or Scopus Gold - WoS - ESCI 36878 1478 44793631 0.000 0 ****
Diamond - Not in WoS or Scopus Gold - WoS or Scopus 36878 16682 570289852 0.000 0 ****
Diamond - WoS - ESCI Diamond - WoS or Scopus 6617 29555 131037690 0.000 0 ****
Diamond - WoS - ESCI Gold - Not in WoS or Scopus 6617 8390 22912151 0.000 0 ****
Diamond - WoS - ESCI Gold - WoS - ESCI 6617 1478 7386899 0.000 0 ****
Diamond - WoS - ESCI Gold - WoS or Scopus 6617 16682 99053391 0.000 0 ****
Diamond - WoS or Scopus Gold - Not in WoS or Scopus 29555 8390 78124342 0.000 0 ****
Diamond - WoS or Scopus Gold - WoS - ESCI 29555 1478 27412302 0.000 0 ****
Diamond - WoS or Scopus Gold - WoS or Scopus 29555 16682 398847420 0.000 0 ****
Gold - Not in WoS or Scopus Gold - WoS - ESCI 8390 1478 9274796 0.000 0 ****
Gold - Not in WoS or Scopus Gold - WoS or Scopus 8390 16682 121138229 0.000 0 ****
Gold - WoS - ESCI Gold - WoS or Scopus 1478 16682 17471720 0.000 0 ****

References