Exercise 10 | Color and Accessibility

Max Pellert (https://mpellert.at)

IS 616: Large Scale Data Analysis and Visualization

Let’s check out some example code from the first hand-in assignment

library("tidyverse")
library("scales")
library("gridExtra")
library("fBasics")
library("xtable")

# Load the three TidyTuesday tables straight from GitHub.

# Age gaps between movie couples (TidyTuesday 2023-02-14)
age_gaps <- read.csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-14/age_gaps.csv"
)

# Movie table of the Bechdel test week (TidyTuesday 2021-03-09)
movies <- read.csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-09/movies.csv"
)

# Raw Bechdel ratings of the same week
bechdel_ratings <- read.csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-09/raw_bechdel.csv"
)

# Summary table with one row per (release_year, age_difference) pair and the
# number of observations sharing that pair in the column `count`.
# age_gaps is a plain, ungrouped data frame here, so the result of count()
# carries no grouping and needs no ungroup() afterwards.
age_gaps_count <- age_gaps %>%
  count(release_year, age_difference, name = "count")

# Attach the pair count to every observation. Every row of age_gaps has a
# matching pair in the summary table (it was built from the same rows), so the
# left join keeps all rows and leaves no missing counts. The count is later
# mapped to point size in the scatterplot.
age_gaps <- age_gaps %>%
  left_join(age_gaps_count, by = c("release_year", "age_difference"))

# Scatterplot of age gap against release year. Point size shows how many
# observations share the same year / gap pair; the orange line is a linear
# regression fit drawn on top of the points.
Age_Gaps_in_Romantic_Movies_by_Year <- age_gaps %>%
  ggplot(aes(x = release_year, y = age_difference)) +
  geom_point(aes(size = count), colour = "dodgerblue") +
  geom_smooth(method = "lm", colour = "orange") +
  labs(
    title = "Age Gaps in Romantic Movies by Year",
    x = "Release Year",
    y = "Age Gap"
  ) +
  theme_minimal()

# Save the plot as a pdf vector graphic
# ggsave("Age_Gaps_in_Romantic_Movies_by_Year.pdf", plot = Age_Gaps_in_Romantic_Movies_by_Year, device = "pdf")

# Display the plot
Age_Gaps_in_Romantic_Movies_by_Year

# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# download the movies table, keep only the year and the `clean_test` column,
# and expose the latter under the name `bechdel_test`
movies = (
    pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-09/movies.csv')
    .loc[:, ['year', 'clean_test']]
    .rename(columns={'clean_test': 'bechdel_test'})
)
# movies.head()

# yearly bin edges 1969, 1970, ..., 2013 that make up the x axis
bins = np.arange(1969, 2014)

# one entry per Bechdel class: (value in the data, label shown in the panel,
# color used for both the area and the label)
bechdel_list = [
    ('dubious', 'Dubious', 'slategray'),
    ('nowomen', 'No two women', 'darkred'),
    ('notalk', "Women don't talk", 'indianred'),
    ('men', 'Women talk only about men', 'lightcoral'),
    ('ok', 'Bechdel test passed', 'cornflowerblue'),
]

# five stacked panels (one per Bechdel class) that share both axes
fig, axs = plt.subplots(
    nrows=5,
    ncols=1,
    sharex=True,
    sharey=True,
    figsize=(11.69, 8.27),  # a4 figsize
    gridspec_kw={'wspace': 0.1, 'hspace': 0.1},
)

# figure background color
fig.set_facecolor('gainsboro')

# title, plus a subtitle placed as free text just below it
fig.suptitle('Evolution of representation of women in movies', fontsize=20, color='black')
fig.text(0.5, 0.94, 'Comparison of the Bechdel test classes over time',
         fontsize=15, ha='center', va='top', color='dimgray')

# frameless axes spanning the whole figure; it only carries the axis labels
# that are common to all panels
label_ax = fig.add_subplot(111, frameon=False)

# hide its ticks and tick labels so only the axis names remain visible
label_ax.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
label_ax.set_xlabel('Year', color='dimgray', fontsize='x-large')
label_ax.set_ylabel('Number of movies', color='dimgray', fontsize='x-large')

# limits and ticks are set on the first panel only; because the panels share
# their axes, the settings apply to all of them
axs[0].set(
    xlim=(1970, 2013),
    ylim=(0, 80),
    xticks=range(1970, 2020, 10),
    yticks=(0, 50),
)

# draw one panel per Bechdel class: each subplot is paired with one entry of
# bechdel_list
for panel, (test_class, class_label, class_color) in zip(axs.flat, bechdel_list):

  # years of all movies in this class, counted per yearly bin
  class_years = movies.loc[movies['bechdel_test'] == test_class, 'year']
  yearly_counts = class_years.value_counts(sort=False, bins=bins)

  # filled area of the counts, positioned at the upper edge of each bin
  panel.fill_between(bins[1:], yearly_counts, color=class_color, edgecolor='lightgray')

  # same font for the tick labels of both axes
  for tick_label in [*panel.get_xticklabels(), *panel.get_yticklabels()]:
    tick_label.set_fontproperties('Courier New')

  # tick direction, label size and label color
  panel.tick_params(axis='both', direction='inout', labelsize='medium', labelcolor='dimgray')

  # class name in the upper-left corner of the panel, in the class color
  panel.text(0.02, 0.9, class_label, fontsize='x-large',
             transform=panel.transAxes, va='top', color=class_color)

# save figure as pdf in local directory
# plt.savefig('graphic.pdf', format='pdf')

# show the graphic
plt.show()

For the second hand-in assignment…

Voeten, E., Strezhnev, A., & Bailey, M. (2009). United Nations General Assembly Voting Data [dataset]. Harvard Dataverse. https://doi.org/10.7910/DVN/LEJUQZ

Voeten, E. (2012). Data and Analyses of Voting in the UN General Assembly. SSRN Electronic Journal. https://doi.org/10.2139/ssrn.2111149

unvotes.csv

variable class description
rcid double The roll call id; used to join with roll_calls and issues
country character Country name, by official English short name
country_code character 2-character ISO country code
vote integer Vote result as a factor of yes/abstain/no

roll_calls.csv

variable class description
rcid integer The roll call id; used to join with unvotes and issues
session double Session number. The UN holds one session per year; these started in 1946
importantvote integer Whether the vote was classified as important by the U.S. State Department report “Voting Practices in the United Nations”. These classifications began with session 39
date double Date of the vote, as a Date vector

roll_calls.csv (continued)

variable class description
unres character Resolution code
amend integer Whether the vote was on an amendment; coded only until 1985
para integer Whether the vote was only on a paragraph and not a resolution; coded only until 1985
short character Short description
descr character Longer description

issues.csv

variable class description
rcid integer The roll call id; used to join with unvotes and roll_calls
short_name character Two-letter issue codes
issue character Descriptive issue name

In R

# Get the Data

# Option 1: read in with the tidytuesdayR package
# Install from CRAN via: install.packages("tidytuesdayR")
# This loads the readme and all the datasets for the week of interest.
# The week can be given either as an ISO-8601 date or as year + week number.
# Both forms refer to the same week, so only one call is run; the other is
# kept as a commented alternative to avoid downloading the data twice.

tuesdata <- tidytuesdayR::tt_load('2021-03-23')
# tuesdata <- tidytuesdayR::tt_load(2021, week = 13)

unvotes <- tuesdata$unvotes

# Option 2: read in the data manually, one csv file per table
# (this replaces the `unvotes` object created under option 1)

unvotes <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-23/unvotes.csv')
roll_calls <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-23/roll_calls.csv')
issues <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-23/issues.csv')

In Python

import pandas as pd

# all three tables are stored in the same TidyTuesday week folder
data_dir = (
  'https://raw.githubusercontent.com/rfordatascience/'
  'tidytuesday/master/data/2021/2021-03-23/'
)

# one DataFrame per csv file
unvotes = pd.read_csv(data_dir + 'unvotes.csv')
roll_calls = pd.read_csv(data_dir + 'roll_calls.csv')
issues = pd.read_csv(data_dir + 'issues.csv')

Until next time…

Take a look at the United Nations data

Load it, do explorations to see if you run into any issues

After next unit, you will know more about geospatial data visualization and you should use that knowledge with the UN data for your second hand-in assignment