import numpy as np
import pandas as pd
import math
from functools import partial
from pylab import *
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
def ttest_group(data, var, by=None, test='ind', equal_var=False):
"""Runs a t-test on each combination of values for the by-variable
data: data to do test on
var: either a variable name, if by is specified, or an array
by: variable to construct groups from
test: type of test to run: 'ind' - ttest_ind, 'rel' - ttest_rel, 'wilcoxon' - wilcoxon
equal_var: whether to assume equal variance (otherwise use Welch's)
"""
if by != None:
g = data.groupby(by)[var]
keys = g.groups.keys()
res = pd.DataFrame(data=np.empty(shape=(len(keys), 2*len(keys))),
index=g.groups.keys(),
columns=pd.MultiIndex.from_product([['t stat.','p-value'], keys], names=[var, by]),
dtype='object')
else:
keys = var
res = pd.DataFrame(data=np.empty(shape=(len(keys), 2*len(keys))),
index=keys,
columns=pd.MultiIndex.from_product([['t stat.','p-value'], keys]),
dtype='object')
#res.index.name = by
#res.columns.name = by
for key1 in keys:
for key2 in keys:
g1 = g.get_group(key1) if by != None else data.loc[(data[key1].notnull()) & (data[key2].notnull()), key1]
g2 = g.get_group(key2) if by != None else data.loc[(data[key1].notnull()) & (data[key2].notnull()), key2]
if key1 == key2: # can't do t-test of a variable on itself
res.loc[key1, ('t stat.', key2)] = "x"
res.loc[key1, ('p-value', key2)] = "x"
else:
if test=='ind':
res.loc[key1, ('t stat.', key2)] = "%0.2f" % stats.ttest_ind(g1,
g2,
equal_var=equal_var)[0]
res.loc[key1, ('p-value', key2)] = "%0.3f" % stats.ttest_ind(g1,
g2,
equal_var=equal_var)[1]
elif test=='rel': # relative
res.loc[key1, ('t stat.', key2)] = "%0.2f" % stats.ttest_rel(g1,
g2)[0]
res.loc[key1, ('p-value', key2)] = "%0.3f" % stats.ttest_rel(g1,
g2)[1]
elif test=='wilcoxon':
res.loc[key1, ('t stat.', key2)] = "%0.2f" % stats.wilcoxon(g1,
g2)[0]
res.loc[key1, ('p-value', key2)] = "%0.3f" % stats.wilcoxon(g1,
g2)[1]
return res
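As a quick illustration of the call signature (the DataFrame and column names here are made up, purely for demonstration):
# Hypothetical example data: one numeric column, one grouping column
example = pd.DataFrame({'score': np.random.randn(90),
                        'group': np.repeat(['a', 'b', 'c'], 30)})
# Pairwise Welch t-tests of 'score' between every pair of groups in 'group'
ttest_group(example, var='score', by='group')
When by is omitted and var is a list of column names, the same function runs pairwise tests between the columns themselves, as done further below.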
import warnings
def desc(data,
float_decimals=3,
stats=['count','mean','std','min','percentiles','max'], percentiles=[25,50,75]):
"""Get a DataFrame with statistical aggregations of a DataFrame
Basically a version of pd.DataFrame.describe() that allows you to customize
the statistics by specifying functions to apply to each column.
"""
# Create empty columns list
cs = []
# Calculate each of the statistics
for stat in stats:
if isinstance(stat, str):
if stat == 'count':
cs.append(pd.DataFrame(data.count(), columns=['count']))
elif stat == 'mean':
cs.append(pd.DataFrame(data.mean(), columns=['mean']))
elif stat == 'std':
cs.append(pd.DataFrame(data.std(), columns=['std']))
elif stat == 'min':
cs.append(pd.DataFrame(data.min(), columns=['min']))
elif stat == 'max':
cs.append(pd.DataFrame(data.max(), columns=['max']))
elif stat == 'percentiles' or stat == 'pct':
pctiles = data.dropna().apply(partial(np.percentile, q=percentiles))
cs.append(pd.DataFrame.from_records(pctiles.tolist(),
index = pctiles.index,
columns = map(lambda x: '%d%%' % x, percentiles))) # add "%" to end of each element
else:
warnings.warn("No known procedure for statistic '%s'; "
"only 'count', 'mean', 'std', 'min', 'percentiles' or 'pct', and 'max' are supported" % stat)
else:
cs.append(data.apply(stat))
# Concat all the stat DataFrames
d = pd.concat(cs, axis=1)
# Change float columns to formatted strings (to reduce number of decimals according to float_decimals)
    for c in d.loc[:, d.dtypes == float].columns:
d[c] = d[c].apply(lambda x: ('{:0.%df}' % float_decimals).format(x))
return d
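A minimal usage sketch (made-up data): besides the built-in statistic names, any callable that reduces a column to a scalar can be passed in the stats list.
example = pd.DataFrame({'x': np.random.randn(50),
                        'y': np.random.rand(50)})
# Mix named statistics with an arbitrary column-reducing callable (here, the range)
desc(example,
     stats=['count', 'mean', 'percentiles', lambda col: col.max() - col.min()],
     percentiles=[10, 50, 90])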
What defines a good or effective article on Wikipedia?
But to ask that question, we must also ask: What is the purpose of a Wikipedia article? But again, this question forces another: What do people go to Wikipedia for? For, if we know why users use Wikipedia, it makes sense to then say that the purpose of its articles is to satisfy the needs of the users, and thus an effective article is one which satisfies the needs of those who visit it. (It should be noted, explicitly, of course, that I am here considering the users to primarily be those who read Wikipedia, rather than those who edit it—though of course editors are also users in this sense, just a small proportion of them; if Wikipedia is conceived of as primarily a community of editors, a sort of virtual hobby whose purpose is to satisfy the hobbyists, then the above questions become much less significant and, I believe, so does Wikipedia itself.)
I argue that Wikipedia users go to Wikipedia for reasons that, often, could be roughly broken down into 3 categories:
To learn: a user is curious about a concept, or they are researching an assignment, or they have come across something they don't understand. These users want a brief description of what they are looking for, a sort of long-form definition. Often, such users are not experts in the article's broader field, and thus are looking for a description that is understandable for the layperson.
To find information: a user is looking for a particular fact or detail—Who is so-and-so married to? When did so-and-so die? When was the Battle of Such-and-such? This of course could be seen as a subset of #1 (and in fact so could all uses), but I distinguish it because, in this case, the user knows what he or she wants to find. They know that So-and-so is a public figure, they just don't know who So-and-so's spouse is. These users often don't need to read the articles themselves: much of the factual information is summarized in the infoboxes. What is the electronegativity of platinum? Note that such a user would never search for that information if they did not already know.
To remember what has been forgotten: a user is refreshing their understanding of a subject. Again, this could be a subset of both #1 and #2, but I distinguish it because, unlike #1, the user is already quite familiar with the material, and unlike #2, the user is looking for something that is not a clear-cut fact which could be listed in an infobox. For example: When do I use a one-tailed t-test vs. a two-tailed one? Where was the epigraph to The Waste Land from? How many of Wittgenstein's brothers committed suicide? What was the precipitating event of the Opium Wars? This user will make use of the section headings and perhaps look for a certain phrase in the page, skimming until the information is found. Again, this user knows a fair amount about the subject, as well as how the subject relates to the broader context.
Though all related, these use cases for Wikipedia are, I think, quite distinct, insofar as each will be looking in different parts of the article (e.g. #1 will read the introduction more than #2 or #3 will, #2 will consult the infobox primarily, while the others may ignore it, and #3 may pay close attention to the contents to find the relevant section). Each will also judge the article differently. #1 will likely value clarity of writing, as well as comprehensiveness, but may not have the background to really judge how objective or trustworthy an article is (though of course they will notice the extent to which citations are used, etc.). #2 may not read any of the writing, but will notice if key facts are left out. #3 will rely extensively on the article being well organized as well as complete and comprehensive.
Which leads me to: How can (should) we interpret readers' subjective ratings of Wikipedia articles? Can the overall quality of an article be measured by looking at the mean of various readers' ratings? Is there even such a thing as overall quality in this context?
df = pd.read_csv('cleaned_wiki_ratings.csv')
df.info()
r = df.groupby('page_title').page_rate_count.max()
r.describe()
r.sort(ascending=False)
r.head(10)
Ha, the Hunger Games! Not very surprising. Some of the others are interesting, though! (Knights Templar?!)
NB: Of course, this is a sample, so this does not really say much about the world itself.
Let's look at a plot of the distribution:
fig = plt.figure()
r.sort() # sort ascending and take the tail so that the top bar on the plot is the highest of the group
ax = r.tail(20).plot(
kind='barh',
title='Most Rated Pages (in sample of 10,000 pages)',
figsize=(10, 8))
ax.set_xlabel('Number of Ratings')
And let's look at the overall distribution.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18,5))
for i, section, title in [(0, r, 'All Pages'),
(1, r.iloc[-100:], 'Pages w/ Most Ratings'),
(2, r.iloc[0:-100], 'Pages w/ Fewest Ratings')] :
section.hist(ax=axes[i],
normed=True,
                 bins=int(math.sqrt(section.count())),  # hist expects an integer bin count
color='lightskyblue',
alpha=0.7)
section.plot(ax=axes[i],
kind='kde',
xlim=(section.min(), section.max()),
title="%s (N=%d)" % (title, section.count()))
axes[i].set_xlabel("Number of Ratings of Page")
fig.suptitle("Distribution of Ratings per Page (Sample of 10,000 Pages)")
#fig.tight_layout()
The right-tailed distribution is hardly surprising, though the drop-off is quite sharp, as the most-rated pages tend to be rated many times more than the next-most-rated.
More versions mean that we have more rated edits, and thus can trace article quality over a longer history.
NB: I distinguish between "versions" and "rated versions" (or in some cases "versions rated") because I am using data from the ratings dataset only, at this stage. Any versions of the page that don't have any ratings are thus invisible in this dataset; there is no way, for example, to look at the number of edits made to a page, only the number of edits that have been rated.
v = df.groupby('page_title').rev_id.unique().apply(lambda x: x.size)
v.describe()
v.sort(ascending=False)
v.head(10)
Some clear similarities with 1a.
fig = plt.figure()
v.sort()
ax = v.tail(20).plot(
kind='barh',
title='Pages with Most Versions Rated',
figsize=(10, 8))
ax.set_xlabel('Number of Versions Rated')
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,5))
for i, section, title in [(0, v, 'All Pages'),
(1, v.iloc[-100:], 'Pages w/ Most Versions Rated'),
(2, v.iloc[0:-100], 'Pages w/ Fewest Versions Rated')] :
section.hist(ax=axes[i],
normed=True,
                 bins=int(min(math.sqrt(section.count()), len(section.unique()))),
color='lightskyblue',
alpha=0.7)
section.plot(ax=axes[i],
kind='kde',
xlim=(section.min(), section.max()),
title="%s (N=%d)" % (title, section.count()))
axes[i].set_xlabel("Number of Rated Versions for Page")
fig.suptitle("Distribution of Rated Versions per Page (Sample of 10,000 Pages)")
#fig.tight_layout()
Again, we have a Zipf's law-type distribution with a long right tail.
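As a purely illustrative check of that Zipf-like shape (not part of the analysis proper), a log-log rank-frequency plot should look roughly linear:
ranked = v.copy()
ranked.sort(ascending=False)  # most rated versions first
plt.figure(figsize=(6, 4))
plt.loglog(np.arange(1, len(ranked) + 1), ranked.values, 'b.', alpha=0.5)
plt.xlabel('Page rank (1 = most rated versions)')
plt.ylabel('Number of rated versions')
plt.title('Rank-frequency plot of rated versions per page (log-log)')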
Ratings per version gives us a measure of the information we have for each version, i.e. how reliable the ratings are.
rpv = r / v
rpv.describe()
rpv.sort(ascending=False)
rpv.head(10)
This is interesting! Not what you'd expect. RPV can be conceived of as the view velocity divided by the editing velocity, i.e. the number of views per edit (there are, of course, a lot of assumptions underlying that conception).
fig = plt.figure()
rpv.sort()
ax = rpv.tail(20).plot(
kind='barh',
title='Pages with Highest Ratings-Per-Version-Rated',
figsize=(10, 8))
ax.set_xlabel('Ratings per version')
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,5))
for i, section, title in [(0, rpv, 'All Pages'),
(1, rpv.iloc[-100:], 'Pages w/ Highest RPV'),
(2, rpv.iloc[0:-100], 'Pages w/ Lowest RPV')] :
section.hist(ax=axes[i],
normed=True,
                 bins=int(math.sqrt(section.count())),
color='lightskyblue',
alpha=0.7)
section.plot(ax=axes[i],
kind='kde',
xlim=(section.min(), section.max()),
title="%s (N=%d)" % (title, section.count()))
axes[i].set_xlabel("Number of Ratings per Version Rated for Page")
fig.suptitle("Distribution of Ratings-Per-Version-Rated per Page (Sample of 10,000 Pages)")
#fig.tight_layout()
Note that, because this is a quotient of integer counts, there are bumps at simple ratios p/q.
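To see why, here is a small synthetic sketch (made-up counts, not the ratings data): quotients of small integer counts pile up on a handful of simple fractions.
# Ratios of small random integer counts concentrate at values like 1, 2, 3/2, 1/2, ...
fake_r = np.random.randint(1, 10, size=5000)   # stand-in "ratings" counts
fake_v = np.random.randint(1, 5, size=5000)    # stand-in "rated versions" counts
plt.figure(figsize=(6, 4))
plt.hist(fake_r / fake_v.astype(float), bins=100, color='lightskyblue')
plt.xlabel('Quotient of two small integer counts')
plt.title('Bumps at simple ratios p/q (synthetic counts)')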
Let's look at the distribution of all ratings. First we'll look at rating_all_mean (the mean of all four dimensions, computed only when every dimension is present) and rating_any_mean (the mean of whichever dimensions are present). We'll also look at the number of dimensions rated.
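As a sanity check on how I read those two columns (this is a sketch, not the code that produced the cleaned CSV), they could be reconstructed from the four dimension columns like so:
dims4 = ['comp', 'obj', 'trust', 'writ']
# Mean over whichever dimensions were rated
any_mean_check = df[dims4].mean(axis=1)
# Same mean, but kept only where all four dimensions are present
all_mean_check = any_mean_check.where(df[dims4].notnull().all(axis=1))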
#bins = [1,1.5,2,2.5,3,3.5,4,4.5,5]
#bins = [0.75,1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.25, 4.75, 5.25]
bins=8
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
# Means
for i, var in enumerate(['rating_all_mean', 'rating_any_mean']):
df[var].dropna().hist(ax=axes[i],
normed=True,
bins=bins,
color='lightskyblue',
alpha=0.7)
df[var].dropna().plot(ax=axes[i],
kind='kde',
xlim=(1,5),
ylim=(0,1),
title=var)
# Number of dimensions per rating
dim_counts = df.groupby(['rating_dim_count']).page_id.count() / df.page_id.count()
dim_counts
#if True:
dim_counts.plot(ax=axes[2],
kind='bar',
color='lightskyblue',
alpha=0.7,
rot=0,
title="No. of Dimensions Rated")
axes[0].set_title("Ratings w/ All 4 Dim. Rated")
axes[0].set_xlabel("Mean of All Rated Dimensions")
axes[1].set_title("Ratings w/ Any Dim. Rated")
axes[1].set_xlabel("Mean of All Rated Dimensions")
axes[2].set_xlabel("Number of Dimensions Rated")
axes[2].set_ylabel("Proportion of All Ratings")
fig.suptitle("Distribution of Rating Values and Number of Dimensions Rated (Sample of 10,000 Pages)", va='bottom')
So, most people rate all 4 dimensions, and most articles are rated 5 across all dimensions. There is also a significant minority who give all 1s.
Why would a user only rate one (or two, or three) dimensions? Perhaps they feel they are not qualified to judge one or more aspects of an article. Or, perhaps they only care about particular dimensions; maybe, for example, a user doesn't believe "trustworthiness" is discernible from "objectivity", and thus they only rate one of those. Does the distribution of ratings change based on how many dimensions are rated? For example, perhaps people rate one dimension when they find something particularly egregious, and so they tend to give poor ratings.
Let's look at the distribution of rating_any_mean by number of rated dimensions.
#bins = [1,1.5,2,2.5,3,3.5,4,4.5,5,5.5]
#bins = [0.75,1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.25, 4.75, 5.25]
bins=8
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(16, 4))
for i, d in enumerate([1,2,3,4]):
axes[i].hist(df[df.rating_dim_count==d].rating_any_mean.values, normed=False, bins=bins, align='mid')
axes[i].set_title("Dimensions Rated = %d" % d)
fig.suptitle('Distribution of rating_any_mean',
size='large', weight='bold', va='bottom')
# Output description
df.groupby('rating_dim_count').rating_any_mean.describe().unstack()
So it appears that users who rate only one dimension tend to rate that dimension more highly. Let's do a t-test to make sure:
ttest_group(df, by='rating_dim_count', var='rating_any_mean', equal_var=False)
Now let's look at each particular dimension. Are users more forgiving on some dimensions vs. others?
dims = ['comp', 'obj', 'trust', 'writ']
nonmiss = df.loc[:,dims].count() / len(df)
nonmiss.name = 'pct not missing'
d = desc(df.loc[:,dims], percentiles=range(10,100,10))
pd.concat([pd.DataFrame(nonmiss), d], axis=1)
So it appears that "completeness" tends to be rated lower than other dimensions, while "well-written" tends to be rated higher. Also, fewer users rate "objectivity", while more users rate "well-written".
To look at whether these dimensions can be deemed to be different, let's do a Wilcoxon signed-rank test (since we know the data is not normally distributed):
ttest_group(df, var=['comp','obj','trust','writ'], test='wilcoxon')
With a p-value of 0.153, we cannot reject the null hypothesis that "Objective" and "Well-written" are measuring the same thing, though the p-value is hardly high enough to positively suggest that they are.
Let's look at histograms of ratings for each dimension.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(12, 4), sharey=False)
for i, d in enumerate(dims):
gs = df.groupby([d])
rtgs = df[d].dropna().count()
x = gs[d].count() / rtgs
x.plot(ax = axes[i], kind='bar', ylim=(0,.5), rot=0)
axes[i].set_title(d)
axes[i].set_xlabel("Rating Value")
axes[0].set_ylabel("Proportion of Ratings")
axes[0].set_title("Complete")
axes[1].set_title("Objective")
axes[2].set_title("Trustworthy")
axes[3].set_title("Well-written")
fig.suptitle('Distribution of Each Rating Dimension', va='bottom')
Are dimensions correlated? We already know that a lot of users give all 5s. But perhaps some dimensions are more correlated with others?
for method in ['pearson', 'spearman']:
print "%s Correlation" % method
print df[dims].corr(method)
As we might expect, "objective" is correlated most highly with "trustworthy". Perhaps less expected, "complete" and "well-written" are also more correlated with each other than they are with the other dimensions. Both observations hold whether we look at the ordinary Pearson correlation or use Spearman's rank correlation coefficient. Still, all dimensions are highly (positively) correlated with each other.
Ideally, ratings would cluster pretty closely for a particular page, and certainly for a particular version. If ratings are truly meaningful to the community as a whole (vs. to a certain individual or a small group), then users should be able to come to some sort of consensus about the quality of a particular page. We will explore this issue more deeply in section 3 (Page Rating Histories), but here we can look at some general statistics.
We'll start by looking at the distribution of ratings around the mean rating. First, I'll focus on ratings that have all four dimensions, since I suspect that such ratings will be of higher quality (on the assumption that someone rating all four dimensions is more likely to actually consider all dimensions -- though of course it is also possible that people who rate the same across the board are actually bringing less consideration to the process).
We will also only look at pages with at least 3 ratings, and then pages with at least 50 ratings, and finally with at least 300 ratings.
Group ratings by page and by version (revision), and calculate the mean and median of rating_all_mean, as well as the count of ratings with all dimensions rated.
NB: rating_all_mean is the mean across all rating dimensions for a single rating, while page_all_mean (or page_comp_mean, etc.) is the mean of rating_all_mean (or comp, etc.) across all observations for each page.
p_ratings = df.groupby('page_title')
df['page_all_mean'] = p_ratings.rating_all_mean.transform('mean') # the per-page mean of `rating_all_mean`
df['page_all_median'] = p_ratings.rating_all_mean.transform('median') # the per-page median of `rating_all_mean`
df['page_all_rate_count'] = p_ratings.rating_all_mean.transform('count') # the per-page count of non-null `rating_all_mean`
# i.e. the count of rows with all 4 dimensions
for dim in dims: # means for each rating dimension (comp/trust/obj/writ)
df['page_%s_mean' % dim] = p_ratings[dim].transform('mean')
#df['page_%s_median' % dim] = p_ratings[dim].transform('median')
# Do same as above, but this time grouping by VERSION (aka revision)
v_ratings = df.groupby('rev_id')
df['rev_all_mean'] = v_ratings.rating_all_mean.transform('mean')
df['rev_all_median'] = v_ratings.rating_all_mean.transform('median')
df['rev_all_rate_count'] = v_ratings.rating_all_mean.transform('count')
for dim in dims:
df['rev_%s_mean' % dim] = v_ratings[dim].transform('mean')
#df['rev_%s_median' % dim] = v_ratings[dim].transform('median')
For each page and version, compute difference variables: the difference between each individual rating (rating_all_mean or a single rating dimension value) and the corresponding page/version mean or median, along with the absolute value of that difference.
for unit in ['page', 'rev']:
for avg in ['mean', 'median']:
df['diff_%s_all_%s' % (unit, avg)] = df.rating_all_mean - df['%s_all_%s' % (unit, avg)]
df['absdif_%s_all_%s' % (unit, avg)] = abs(df.rating_all_mean - df['%s_all_%s' % (unit, avg)])
if avg == 'mean':
for dim in dims:
df['diff_%s_%s_%s' % (unit, dim, avg)] = df[dim] - df['%s_%s_%s' % (unit, dim, avg)]
df['absdif_%s_%s_%s' % (unit, dim, avg)] = abs(df[dim] - df['%s_%s_%s' % (unit, dim, avg)])
Let's look first at the mean absolute deviations (MADs) for each page/version, using both mean and median as our measure of central tendency.
desc(df[df.page_all_rate_count >= 2].filter(regex='absdif_page.*')).sort()
desc(df[df.rev_all_rate_count >= 2].filter(regex='absdif_rev.*')).sort()
So, the spread of ratings within each version is smaller than within each page, which makes sense. Still, the spread is relatively large: each dimension, for example, is on average almost a whole point away from the mean. For cases where there are exactly 2 ratings of a certain version, this means that, on average, those ratings are separated by nearly 2 whole points, since their mean is their midpoint: a 5 and a 3, say, or a 4 and a 2.
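To make that arithmetic explicit with a toy pair of ratings:
# With exactly two ratings a and b, the version mean is their midpoint,
# so each rating is |a - b| / 2 away from it: a MAD near 1 implies a gap near 2.
a, b = 5.0, 3.0
version_mean = (a + b) / 2                                   # 4.0
mad = (abs(a - version_mean) + abs(b - version_mean)) / 2    # 1.0
abs(a - b) == 2 * mad                                        # True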
What happens when we change the cutoff?
First, let's look at the distribution of the counts of pages and versions with all dimensions rated.
desc(df.filter(like='all_rate_count'), float_decimals=1, percentiles=[5,10,25,50,75,90,95]).sort().T
desc(pd.DataFrame(df[df.rev_all_rate_count >= 2].groupby('rev_id').rev_all_rate_count.first()), float_decimals=1, percentiles=[5,10,25,50,75,90,95]).sort().T
Now, I'll plot the mean absolute deviations across cutoffs.
from matplotlib.ticker import FuncFormatter
lines = [] # list of SERIES of points, one for each of the lines illustrating change in AAD/MAD;
n = [] # series of counts of pages satisfying cutoff
count = len(df) # total obs. count
cutoffs = range(2,252)
# For each cutoff, create an entry in the lines list and the N list, the first corresponding
# to a Series with the means for each of the variables when the sample is restricted to that
# cutoff.
for cutoff in cutoffs:
# Add a series to the lines list
lines.append(df[df.page_all_rate_count >= cutoff].filter(regex='absdif_page.*').mean())
# Add the percentage of the sample cover to the n list
n.append(100 * len(df[df.page_all_rate_count >= cutoff])/float(len(df)))
mad = pd.DataFrame(lines, index=cutoffs) # concat all the Series together, indexing by the cutoffs list
n_line = pd.Series(n, name='n', index=cutoffs) # combine all the points in the list to a Series
fig, ax = plt.subplots(figsize=(10,6))
ax1 = mad.plot(ax=ax, linewidth=2, sort_columns=True,
title="Change in Mean Absolute Deviation based on Minimum Number of Ratings per Page",
style=['k:','c:','r-','g-','m-','b-'],
alpha=.8)
ax1.yaxis.set_label_text('Mean Absolute Deviation')
ax1.xaxis.set_label_text('Minimum Number of Ratings per Page')
ax1.legend(['MAD from Page Mean -\n Mean of all rating dimensions',
'MAD from Page Median -\n Mean of all rating dimensions',
'MAD from Page Mean -\n "Complete"',
'MAD from Page Mean -\n "Objective"',
'MAD from Page Mean -\n "Trustworthy"',
'MAD from Page Mean -\n "Well-Written"'],
loc="upper right",
bbox_to_anchor=(1.55,1))
ax2 = n_line.plot(ax=ax, secondary_y=True, label="% of Ratings in Sample (right axis)", alpha=.5, style='k--', )
ax2.yaxis.set_label_text('% of Ratings Satisfying Cutoff', rotation=-90)
ax2.yaxis.labelpad = 12
def pct_fmt(x, pos=0):
return '%d%%'%(x)
ax2.yaxis.set_major_formatter(FuncFormatter(pct_fmt))
ax2.legend(loc='lower left')
l = []
n = []
count = len(df)
cutoffs = range(2,52)
for cutoff in cutoffs:
l.append(df[df.rev_all_rate_count >= cutoff].filter(regex='absdif_rev.*').mean())
n.append(100 * len(df[df.rev_all_rate_count >= cutoff])/float(len(df)))
aad = pd.DataFrame(l, index=cutoffs)
n_line = pd.Series(n, name='n', index=cutoffs)
fig, ax = plt.subplots(figsize=(10,8))
ax1 = aad.plot(ax=ax, linewidth=2, sort_columns=True,
title="Change in Average Absolute Deviation based on Minimum Number of Ratings per Version")
ax1.yaxis.set_label_text('Average Absolute Deviation')
ax1.xaxis.set_label_text('Cutoff')
ax1.legend(loc="upper right")
ax2 = n_line.plot(ax=ax, secondary_y=True, label="N (right axis)", alpha=.5, style='k--', )
ax2.yaxis.set_label_text('% of Ratings Satisfying Cutoff', rotation=-90)
ax2.yaxis.labelpad = 16
def pct_fmt(x, pos=0):
return '%d%%'%(x)
ax2.yaxis.set_major_formatter(FuncFormatter(pct_fmt))
ax2.legend(loc='lower left')
It appears that restricting our sample to pages/versions with more observations initially increases then steadily decreases the spread of values around the mean/median. (More on what that might mean below.)
Here are the same statistics as presented in the above tables, only now for the non-absolute differences:
desc(df[df.page_all_rate_count >= 2].filter(like='diff_page')).sort()
desc(df[df.rev_all_rate_count >= 2].filter(like='diff_rev')).sort()
Note that the mean column here is not meaningful (except perhaps for the diff_*_all_median rows), since the mean of the difference from the mean is, of course, 0. Of greater interest is the interquartile range, which is consistently shifted to the right, with the exception of the differences based on the median.
Let's explore these distributions further with some histograms at various cutoffs.
cols = df.filter(like='diff_page').columns.values # list of columns starting with 'diff_page'
cols.sort()
for cutoff in [2, 93, 313]:
# Restrict data to:
# - columns of interest
# - rows that have all rating dimensions
# - only for pages with number of ratings above cutoff
d = df.loc[(df.rating_all_mean.notnull()) & (df.page_all_rate_count >= cutoff), cols]
# Create subplot for each col variable
fig, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(4*len(cols), 5), sharey=True)
    # Loop through the columns, plot a histogram and also a kernel density plot on top
for i, var in enumerate(cols):
d[var].dropna().hist(ax=axes[i],
normed=True,
                              bins=int(math.log(d[var].count())),
color='lightskyblue',
alpha=0.7)
d[var].dropna().plot(ax=axes[i],
kind='kde',
linewidth=2,
xlim=(-5,5),
ylim=(0,1),
title=var)
fig.suptitle("Distribution of rating deviation from page-wide mean/median - At least %d ratings with all dimensions" % cutoff,
size='large', weight='bold', va='bottom')
So, all of these plots show the same asymmetry: most people rate slightly higher than the mean, while a smaller group rates significantly lower than it. This makes sense given that the mean rating is fairly high and the rating scale is truncated at 5.
The more meaningful factor is the spread, giving an indication of how consistent the ratings are. There is a consistent bimodal pattern here, which could indicate that ratings are not, in fact, centered on a single mean. There may be, say, two types of people rating articles, with each type having significantly different evaluation metrics and models.
Another aspect to note is that pages with greater numbers of ratings do not seem to lose this bimodality; if anything, the bimodality is strengthened, indicating that it is not a problem of low N, but rather a consistent pattern. In fact, it is even possible that articles with many ratings are more controversial, meaning a greater variance (indicated in this case by bimodality).
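As a purely illustrative sketch of that idea (simulated ratings, not the actual data), mixing two rater populations with different typical ratings produces exactly this kind of bimodal deviation histogram:
# One "generous" population clustered near 5, one "critical" population near 2
generous = np.clip(np.round(np.random.normal(4.7, 0.5, 300)), 1, 5)
critical = np.clip(np.round(np.random.normal(2.0, 0.7, 150)), 1, 5)
simulated = np.concatenate([generous, critical])
deviations = simulated - simulated.mean()
plt.figure(figsize=(6, 4))
plt.hist(deviations, bins=20, color='lightskyblue', alpha=0.7)
plt.xlabel('Deviation from simulated page mean')
plt.title('Two rater populations produce bimodal deviations (simulated)')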
Looking at versions (using a different set of cutoffs), we see similar patterns:
cols = df.filter(like='diff_rev').columns.values
cols.sort()
cols
titles = {'diff_rev_all_mean': 'Deviation from Version Mean \n Mean of All Dim.',
'diff_rev_all_median': 'Deviation from Version Median \n Mean of All Dim.',
'diff_rev_comp_mean': 'Deviation from Version Mean \n "Complete"',
'diff_rev_obj_mean': 'Deviation from Version Mean \n "Objective"',
'diff_rev_trust_mean': 'Deviation from Version Mean \n "Trustworthy"',
'diff_rev_writ_mean': 'Deviation from Version Mean \n "Well-Written"'}
for cutoff in [2, 4, 6]:
d = df.loc[(df.rating_all_mean.notnull()) & (df.rev_all_rate_count >= cutoff), cols]
fig, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(4*len(cols), 5), sharey=False)
for i, var in enumerate(cols):
d[var].dropna().hist(ax=axes[i],
normed=True,
                              bins=int(math.log(d[var].count())),
color='lightskyblue',
alpha=0.7)
d[var].dropna().plot(ax=axes[i],
kind='kde',
linewidth=2,
xlim=(-5,5),
ylim=(0,1))
axes[i].set_title(titles[var], size='x-large')
axes[i].set_ylabel("", size='x-large')
axes[i].tick_params(axis='both', which='major', labelsize='x-large')
axes[0].set_ylabel("Density")
fig.suptitle("Distribution of rating deviation from version-wide mean/median - "
"At least %d ratings with all dimensions per version" % cutoff,
size='xx-large', weight='bold', va='bottom', y=1)
If we assume that logged-in users are more savvy about Wikipedia standards and the stated goals of each page, then it might be true that such logged-in users would have more "accurate" (in the sense of reflecting the a priori standards of Wikipedia) ratings. They might also be expected to have ratings that are more closely linked to actual knowledge about the topics.
Let's plot the differences in the distributions of ratings based on whether the user is logged in or not.
titles = {'rating_all_mean': 'Mean,\n All Dim. Rated',
'rating_any_mean': 'Mean,\n Any Dim. Rated',
'comp': '"Complete"',
'obj': '"Objective"',
'trust': '"Trustworthy"',
'writ': '"Well-Written"'}
#bins = [1,1.5,2,2.5,3,3.5,4,4.5,5,5.5]
#bins = [0.75,1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.25, 4.75, 5.25]
bins=8
pcols = ['rating_all_mean', 'rating_any_mean'] # plot columns
dims = ['comp', 'obj', 'trust', 'writ'] # dimensions of rating
pcols.extend(dims)
fig, axes = plt.subplots(nrows=2, ncols=len(pcols), figsize=(15, 6), sharex=False, sharey=False)
login_gs = df.groupby('logged_in')
# For each row (key, data) in the logged_in-groups
for r, (k, d) in enumerate(login_gs.groups.items()):
# for each column, variable
for c, var in enumerate(pcols):
# For continuous variables (with >10 unique values), make histogram
if len(df.loc[d, var].unique()) > 10:
df.loc[d, var].hist(ax=axes[r, c], normed=True, bins=bins, align='mid')
if r == 0:
axes[r,c].set_title(titles[var], size='large')
# For discrete variables, just plot frequency of each option
else:
gs = df.loc[d].groupby(var) # group by the variable
rtgs = df.loc[d, var].dropna().count()
x = gs[var].count() / rtgs
x.plot(ax = axes[r, c], kind='bar', ylim=(0,.5), rot=0)
if r == 0:
axes[r, c].set_title(titles[var], size='x-large')
axes[r, c].set_xlabel("")
# If first column, set y-label to logged in value
if c == 0:
axes[r, c].set_ylabel("Logged-In = %d" % k)
fig.suptitle('Proportion of Each Rating, Based on Logged-in Status', size='x-large', va='bottom')
fig.tight_layout()
So, logged-in users appear to be less likely to give ratings of 1.
print "Logged in"
print desc(df[(df.page_all_rate_count >= 2) & (df.logged_in==1)].filter(regex='absdif_page.*')).sort()
print "Not logged in"
print desc(df[(df.page_all_rate_count >= 2) & (df.logged_in==0)].filter(regex='absdif_page.*')).sort()
The average absolute difference for ratings across pages is lower for users that are logged in.
print "Logged in"
print desc(df[(df.rev_all_rate_count >= 2) & (df.logged_in==1)].filter(regex='absdif_rev.*')).sort()
print "Not logged in"
print desc(df[(df.rev_all_rate_count >= 2) & (df.logged_in==0)].filter(regex='absdif_rev.*')).sort()
The average absolute difference for ratings across versions is also lower for users who are logged in. Let's run some t-tests to see whether these differences in means are significant.
cols = df.filter(like="absdif").columns.values
cols.sort()
for var in cols:
print ttest_group(df.loc[(df.rev_all_rate_count >= 2), [var, 'logged_in']].dropna(),
by='logged_in', var=var)
So, the data appears to support the notion that logged-in users produce more reliable (or, at least, consistent) ratings.
Before we can look at the page rating history, we need a sense of a "version" that is consistent across pages. The most sensible solution at this point (since we haven't merged in actual revision data from the Wiki DB) is to use nth_version. Thus, each rating can be seen as a piece of information about a particular version, with the versions themselves being the true "observations".
First we merge in the RPV values for each rating.
df = pd.merge(left=df, right=pd.DataFrame(rpv, columns=['rpv']), left_on='page_title', right_index=True)
df = df.drop('Unnamed: 0', 1)
df.info()
Now we need to create the nth_version variable to index versions.
versions = df[['page_title','rev_id','nth_rating']].groupby(['page_title', 'rev_id']).first().reset_index()
versions['nth_version'] = versions.groupby(['page_title']).cumcount() + 1
Now merge versions into the main dataframe.
df = pd.merge(left=df, right=versions.drop('nth_rating', 1), left_on=['page_title', 'rev_id'], right_on=['page_title', 'rev_id'])
To look more closely at the data, we will examine frequently edited, frequently rated (FEFR) pages: pages with at least 5 rated versions, keeping the 20 of those with the highest ratings-per-rated-version (RPV).
These pages have some amount of history, with a fair amount of information for each version in the history. Note, however, that both the number of rated versions (i.e. versions that are in our data) and the ratings per rated version (RPV) are not necessarily exogenous; in other words, it is quite possible that pages with many versions (and thus many edits), or with high RPV, differ in key and thus far unobserved ways from other pages. For example, these pages might be more controversial; we can imagine that controversial pages would lead people both to rate pages they strongly disagree with and to edit those pages, thus affecting both measures I use to define the sample. However, it is difficult, for example, to have confidence in the mean rating if there are only a handful of ratings for a particular page version. Thus, I'll leave the investigation of how representative these frequently edited, frequently rated pages are of Wikipedia pages in general until a later stage, when I have incorporated data from the Wiki DB itself.
df['page_rev_count'] = df.groupby('page_title').nth_version.transform('max')
fe_rpv = df[df.page_rev_count >= 5].groupby('page_title').rpv.first()
fe_rpv.sort(ascending=False)
fefr = df.set_index('page_title').loc[fe_rpv.head(20).index].reset_index()
Here is a list of the 20 FEFR pages:
fefr.sort(['page_title', 'nth_version', 'nth_rating'])
sorted_fefr = fefr.groupby('page_title')[['page_rev_count', 'page_rate_count', 'rpv']].mean().sort('page_rev_count')
sorted_fefr
var = 'rating_any_mean'
axes = []
for title in sorted_fefr.index[0:]:
d = fefr.loc[fefr.page_title==title, ['page_rev_count', var, 'nth_version']]
prc = d.page_rev_count.mean()
ax = d.boxplot(var, by='nth_version', figsize=(prc/3, 3), grid=False)
#groups = d.groupby('nth_version').groups.keys()
#keys = range(len(ax))
#print keys
#cl = ax.get_children()
#cl=[item for item in cl if isinstance(item, matplotlib.lines.Line2D)]
#print cl
ax.set_ylim([1,5])
ax.set_title(title, ha='left')
plt.suptitle("")
mean = fefr[fefr.page_title==title].groupby('nth_version')[var].mean()
#median = fefr[fefr.page_title==title].groupby('nth_version')[var].median()
mean.plot(ax=ax, style='bo-', linewidth=1, alpha=.8, mfc='none', grid=False)
#median.plot(ax=ax, style='r--')
axes.append(ax)
print "Page Rating History, by Version", "Mean is traced in blue"
There does not appear to be a consistent improvement over time. Let's check this for the whole sample, first by looking at the correlation.
df[['rating_any_mean', 'rating_all_mean', 'comp', 'obj', 'trust', 'writ', 'nth_version']].corr()
Indeed, there is very little correlation at all between nth_version and any of the rating dimensions/averages: according to the page ratings, there is little indication that quality improves significantly with additional edits, all other things being equal. Note that, in order not to resort to ceteris paribus (all other things being equal), i.e. in order to actually look at the effects of different kinds of edits, I'll need to bring in data on the actual edits, using MediaWiki's API.
from wikitools import wiki, api
site = wiki.Wiki('http://en.wikipedia.org/w/api.php')
#fefr.sort(['page_title', 'rev_id'])
first_revs = fefr.groupby('page_title').rev_id.min()
last_revs = fefr.groupby('page_title').rev_id.max()
page_ids = fefr.groupby('page_title').page_id.first()
pd.DataFrame([first_revs, last_revs], index=['first', 'last']).T
Set up the API call, and run it for every page. (For some reason, wikitools won't allow passing an empty rvcontinue parameter, so it is only added to the request when continuing a query.)
def get_revisions(title, rvstartid=None, rvendid=None, rvcontinue=None):
"""Get a list of revision json-objects for a page, starting at a certain rvstartid/rvcontinueid"""
params = {'action':'query',
'titles': title,
'prop': 'revisions',
'rvprop': 'ids|flags|timestamp|user|userid|size|comment|parsedcomment|content|tags|flagged',
'rvdir': 'newer',
'rvstartid': rvstartid,
'rvendid': rvendid,
'rvlimit': 50}#,
#'rvcontinue': rvcontinue}
if rvcontinue != None:
params['rvcontinue'] = rvcontinue
request = api.APIRequest(site, params)
result = request.query(querycontinue=False)
return result
def get_page_revisions(title, startid, endid, pid):
"""
Get all page revisions for a given page, starting at a startid and ending at endid.
This will call get_revisions until the returned object contains no 'query-continue' element.
It returns a list of revision json-objects.
"""
query_res = get_revisions(title, startid, endid)
page_res = query_res['query']['pages']
if pid in page_res:
#print page_res
result = page_res[pid]['revisions']
else:
return []
#print query_res.keys()
while 'query-continue' in query_res:
print "Continued at: %s" % query_res['query-continue']['revisions']['rvcontinue']
if 'warnings' in query_res:
print "Query warning: %s" % query_res['warnings']
query_res = get_revisions(title,
startid,
endid,
query_res['query-continue']['revisions']['rvcontinue'])
#print "Query keys: %s" % query_res.keys()
page_res = query_res['query']['pages']
if pid in page_res:
result.extend(page_res[pid]['revisions'])
return result
Go through each page title and get a list of revision objects. Then, create a dataframe from those objects, and put it in a dict (revs).
revs = {}
for title in first_revs.index:
startid = first_revs[title]
endid = last_revs[title]
pid = str(page_ids[title])
revs[title] = pd.DataFrame(get_page_revisions(title, startid, endid, pid))
print "%s: %d" % (title, len(revs[title]))
# Concat all the revisions into a single dataframe, which we can merge in
revdf = pd.concat(revs.values())
fefr.sort('page_rev_count', ascending=False)
foo = fefr.groupby('page_title', sort=True).page_title.first()
print foo
for title in foo:
nversions = len(fefr.loc[fefr.page_title==title, 'nth_version'].unique())
print title, nversions
#ax = fefr[fefr.page_title==title].boxplot('rating_all_mean', by='nth_version', figsize=(15,3))
#ax.set_ylim((1,5))