In [1]:
import numpy as np
import pandas as pd
import math  # used below for histogram bin counts (math.sqrt / math.log)
from functools import partial
from pylab import *
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
In [2]:
def ttest_group(data, var, by=None, test='ind', equal_var=False):
    """Run a pairwise significance test for every combination of groups (or variables).

    data: DataFrame containing the data to test
    var: a variable name (if `by` is given) or a list of variable names to compare with each other
    by: variable to construct groups from
    test: which test to run: 'ind' - ttest_ind, 'rel' - ttest_rel (paired), 'wilcoxon' - Wilcoxon signed-rank
    equal_var: whether to assume equal variances for 'ind' (otherwise Welch's t-test is used)
    """

    if by is not None:
        g = data.groupby(by)[var]
        keys = g.groups.keys()
        res = pd.DataFrame(data=np.empty(shape=(len(keys), 2*len(keys))),
                       index=g.groups.keys(),
                       columns=pd.MultiIndex.from_product([['t stat.','p-value'], keys], names=[var, by]),
                       dtype='object')
    else:
        keys = var
        res = pd.DataFrame(data=np.empty(shape=(len(keys), 2*len(keys))),
                       index=keys,
                       columns=pd.MultiIndex.from_product([['t stat.','p-value'], keys]),
                       dtype='object')
    #res.index.name = by
    #res.columns.name = by
    for key1 in keys:
        for key2 in keys:
            g1 = g.get_group(key1) if by is not None else data.loc[(data[key1].notnull()) & (data[key2].notnull()), key1]
            g2 = g.get_group(key2) if by is not None else data.loc[(data[key1].notnull()) & (data[key2].notnull()), key2]

            if key1 == key2: # can't test a variable against itself
                res.loc[key1, ('t stat.', key2)] = "x"
                res.loc[key1, ('p-value', key2)] = "x"
            else:
                if test == 'ind': # independent samples (Welch's t-test unless equal_var=True)
                    stat, p = stats.ttest_ind(g1, g2, equal_var=equal_var)
                elif test == 'rel': # related (paired) samples
                    stat, p = stats.ttest_rel(g1, g2)
                elif test == 'wilcoxon': # Wilcoxon signed-rank test (non-parametric, paired)
                    stat, p = stats.wilcoxon(g1, g2)
                else:
                    raise ValueError("unknown test: %s" % test)
                res.loc[key1, ('t stat.', key2)] = "%0.2f" % stat
                res.loc[key1, ('p-value', key2)] = "%0.3f" % p
    return res
    
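As a quick illustration of how this helper is meant to be called, here is a minimal sketch on synthetic data (the toy DataFrame and its column names are mine, not part of the analysis; the real calls appear in section 2):

toy = pd.DataFrame({'score': np.random.randn(300),
                    'score2': np.random.randn(300),
                    'group': np.random.choice(['a', 'b', 'c'], 300)})
ttest_group(toy, var='score', by='group')                   # pairwise Welch t-tests between the three groups
ttest_group(toy, var=['score', 'score2'], test='wilcoxon')  # pairwise Wilcoxon tests between the two columns
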
In [3]:
import warnings
def desc(data, 
         float_decimals=3, 
         stats=['count','mean','std','min','percentiles','max'], percentiles=[25,50,75]):
    """Get a DataFrame with statistical aggregations of a DataFrame
    Basically a version of pd.DataFrame.describe() that allows you to customize 
    the statistics by specifying functions to apply to each column.
    """
    
    # Create empty columns list
    cs = []
    # Calculate each of the statistics
    for stat in stats:
        if isinstance(stat, str):
            if stat == 'count':
                cs.append(pd.DataFrame(data.count(), columns=['count'])) 
            elif stat == 'mean':
                cs.append(pd.DataFrame(data.mean(), columns=['mean'])) 
            elif stat == 'std':
                cs.append(pd.DataFrame(data.std(), columns=['std']))
            elif stat == 'min':
                cs.append(pd.DataFrame(data.min(), columns=['min'])) 
            elif stat == 'max':
                cs.append(pd.DataFrame(data.max(), columns=['max']))
            elif stat == 'percentiles' or stat == 'pct':
                pctiles = data.dropna().apply(partial(np.percentile, q=percentiles))
                cs.append(pd.DataFrame.from_records(pctiles.tolist(),
                                                    index = pctiles.index,
                                                    columns = map(lambda x: '%d%%' % x, percentiles))) # add "%" to end of each element
            else:
                warnings.warn("No known procedure for statistic '%s'; "
                              "only 'count', 'mean', 'std', 'min', 'percentiles' or 'pct', and 'max' are supported" % stat)
        else:
            cs.append(data.apply(stat))
    
    # Concat all the stat DataFrames
    d = pd.concat(cs, axis=1)
    # Change float columns to formatted strings (to reduce number of decimals according to float_decimals)
    for c in d.loc[:, d.dtypes == np.float].columns:
        d[c] = d[c].apply(lambda x: ('{:0.%df}' % float_decimals).format(x))
    return d
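
A minimal usage sketch of desc() on synthetic data (the toy DataFrame is mine, not part of the analysis below):

toy = pd.DataFrame({'a': np.random.randn(200), 'b': np.random.rand(200)})
desc(toy, float_decimals=2, stats=['count', 'mean', 'std', 'pct'], percentiles=[10, 50, 90])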

Introduction

What defines a good or effective article on Wikipedia?

But to ask that question, we must also ask: What is the purpose of a Wikipedia article? But again, this question forces another: What do people go to Wikipedia for? For, if we know why users use Wikipedia, it makes sense to then say that the purpose of its articles is to satisfy the needs of the users, and thus an effective article is one which satisfies the needs of those who visit it. (It should be noted, explicitly, of course, that I am here considering the users to primarily be those who read Wikipedia, rather than those who edit it—though of course editors are also users in this sense, just a small proportion of them; if Wikipedia is conceived of as primarily a community of editors, a sort of virtual hobby whose purpose is to satisfy the hobbyists, then the above questions become much less significant and, I believe, so does Wikipedia itself.)

I argue that Wikipedia users go to Wikipedia for reasons that, often, could be roughly broken down into 3 categories:

  1. To learn: a user is curious about a concept, or they are researching an assignment, or they have come across something they don't understand. These users want a brief description of what they are looking for, a sort of long-form definition. Often, such users are not experts in the article's broader field, and thus are looking for a description that is understandable for the layperson:

    • It does not excessively use vocabulary that is not clearly defined or introduced.
    • It does not define the subject of the article solely based upon other concepts with which the user is similarly unfamiliar.
    • It clearly explains not only what the subject is, but why it is significant in the broader context (an area that, when I have been using Wikipedia in this way, I have found to be lacking).
  2. To find information: a user is looking for a particular fact or detail—Who is so-and-so married to? When did so-and-so die? When was the Battle of Such-and-such? This of course could be seen as a subset of #1 (and in fact so could all uses), but I distinguish it because, in this case, the user knows what he or she wants to find. They know that So-and-so is a public figure, they just don't know who So-and-so's spouse is. These users often don't need to read the articles themselves: much of the factual information is summarized in the infoboxes. What is the electronegativity of platinum? Note that such a user would never search for that information if they did not already know:

    • What platinum is
    • What electronegativity is
    • Why one would need to know the electronegativity of an element
  3. To remember what has been forgotten: a user is refreshing their understanding of a subject. Again, this could be a subset of both #1 and #2, but I distinguish it because, unlike #1, the user is already quite familiar with the material, and unlike #2, the user is looking for something that is not a clear-cut fact which could be listed in an infobox. For example: When do I use a one-tailed t-test vs. a two-tailed one? Where was the epigraph to The Waste Land from? How many of Wittgenstein's brothers committed suicide? What was the precipitating event of the Opium Wars? This user will make use of the section headings and perhaps look for a certain phrase in the page, skimming until the information is found. Again, this user knows a fair amount about the subject, as well as how the subject relates to the broader context.

Though all related, these use cases for Wikipedia are, I think, quite distinct, insofar as each will be looking in different parts of the article (e.g. #1 will read the introduction more than #2 or #3 will, #2 will consult the infobox primarily, while the others may ignore it, and #3 may pay close attention to the table of contents to find the relevant section). Each will also judge the article differently. #1 will likely value clarity of writing, as well as comprehensiveness, but may not have the background to really judge how objective or trustworthy an article is (though of course they will notice the extent to which citations are used, etc.). #2 may not read any of the writing, but will notice if key facts are left out. #3 will rely extensively on the article being well organized as well as complete and comprehensive.

Which leads me to: How can (should) we interpret readers' subjective ratings of Wikipedia articles? Can the overall quality of an article be measured by looking at the mean of various readers' ratings? Is there even such a thing as overall quality in this context?

  1. Are ratings meaningful? Does the rating of one user correlate to the rating of other users?
  2. Are the rating dimensions meaningful? Do people actually judge separate components of an article, or do they simply record a sort of binary good/bad, all-1s/all-5s opinion?
  3. Do articles get better over time?
  4. Does the size (or other measure of magnitude—perhaps reputation of the editor, e.g.) of an edit affect how it changes an article's rating? Does size, for example, improve completeness more than it does trustworthiness or writing quality? Does the number of citations make an article more trustworthy or objective?

Load data

In [4]:
df = pd.read_csv('cleaned_wiki_ratings.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 78524 entries, 0 to 78523
Data columns (total 24 columns):
Unnamed: 0          78524 non-null int64
rev_id              78524 non-null float64
nth_rating          78524 non-null int64
page_id             78524 non-null int64
page_title          78524 non-null object
logged_in           78524 non-null float64
datetime            78524 non-null object
comp                60263 non-null float64
obj                 57496 non-null float64
trust               61993 non-null float64
writ                65568 non-null float64
rating_dim_count    77737 non-null float64
rating_any_mean     77737 non-null float64
rating_all_mean     53303 non-null float64
page_rate_count     78524 non-null int64
rev_rate_count      78524 non-null float64
page_comp_mean      77283 non-null float64
rev_comp_mean       72232 non-null float64
page_obj_mean       76953 non-null float64
rev_obj_mean        71014 non-null float64
page_trust_mean     77490 non-null float64
rev_trust_mean      72943 non-null float64
page_writ_mean      77450 non-null float64
rev_writ_mean       74095 non-null float64
dtypes: float64(18), int64(4), object(2)

1. Ratings per page/Ratings per version

1a - Which page has the most ratings?

In [5]:
r = df.groupby('page_title').page_rate_count.max()
r.describe()
Out[5]:
count    10000.000000
mean         7.852400
std         42.055952
min          1.000000
25%          1.000000
50%          2.000000
75%          5.000000
max       2713.000000
Name: page_rate_count, dtype: float64
In [6]:
r.sort(ascending=False)
r.head(10)
Out[6]:
page_title
The_Hunger_Games                             2713
List_of_spells_in_Harry_Potter               1685
Abraham_Lincoln                              1389
Marilyn_Monroe                                663
Dhirubhai_Ambani                              596
OSI_model                                     567
Knights_Templar                               553
Arithmetic_progression                        442
United_States_Declaration_of_Independence     439
Business_development                          408
Name: page_rate_count, dtype: int64

Ha, the Hunger Games! Not very surprising. Some of the others are interesting, though! (Knights Templar?!)

NB: Of course, this is only a sample of 10,000 pages, so the specific ranking does not say much about Wikipedia as a whole.

Let's look at a plot of the distribution:

In [7]:
fig = plt.figure()
r.sort() # sort ascending and take the tail so that the top bar on the plot is the highest of the group
ax = r.tail(20).plot(
    kind='barh',
    title='Most Rated Pages (in sample of 10,000 pages)',
    figsize=(10, 8))
ax.set_xlabel('Number of Ratings')
Out[7]:
<matplotlib.text.Text at 0x10073b910>

And let's look at the overall distribution.

In [8]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18,5))

for i, section, title in [(0, r, 'All Pages'), 
                (1, r.iloc[-100:], 'Pages w/ Most Ratings'),
                (2, r.iloc[0:-100], 'Pages w/ Fewest Ratings')] :
    section.hist(ax=axes[i], 
           normed=True,
           bins=math.sqrt(section.count()),
           color='lightskyblue',
           alpha=0.7)
    section.plot(ax=axes[i],
                 kind='kde',
                 xlim=(section.min(), section.max()),
                 title="%s (N=%d)" % (title, section.count()))
    axes[i].set_xlabel("Number of Ratings of Page")

fig.suptitle("Distribution of Ratings per Page (Sample of 10,000 Pages)")
#fig.tight_layout()
Out[8]:
<matplotlib.text.Text at 0x10282e710>

The right-tailed distribution is hardly surprising, though the drop-off is quite sharp, as the most-rated pages tend to be rated many times more than the next-most-rated.


1b - Which pages have the most rated versions?

More versions means that we have more rated edits, and thus can trace the article quality over a greater history.

NB: I distinguish between "versions" and "rated versions" (or in some cases "versions rated") because I am using data from the ratings dataset only, at this stage. Any versions of the page that don't have any ratings are thus invisible in this dataset; there is no way, for example, to look at the number of edits made to a page, only the number of edits that have been rated.

In [9]:
v = df.groupby('page_title').rev_id.unique().apply(lambda x: x.size)
v.describe()
Out[9]:
count    10000.000000
mean         3.661200
std         10.193802
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max        580.000000
dtype: float64
In [10]:
v.sort(ascending=False)
v.head(10)
Out[10]:
page_title
The_Hunger_Games                      580
Abraham_Lincoln                       250
Marilyn_Monroe                        169
MDNA_(album)                          157
OSI_model                             145
The_Amazing_Spider-Man_(2012_film)    135
Knights_Templar                       134
House_of_Night                        130
Thirteen_Reasons_Why                  129
Ice_hockey                            124
dtype: int64

Some clear similarities with 1a.

In [11]:
fig = plt.figure()
v.sort()
ax = v.tail(20).plot(
    kind='barh',
    title='Pages with Most Versions Rated',
    figsize=(10, 8))
ax.set_xlabel('Number of Versions Rated')
Out[11]:
<matplotlib.text.Text at 0x1088962d0>
In [12]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

for i, section, title in [(0, v, 'All Pages'), 
                (1, v.iloc[-100:], 'Pages w/ Most Versions Rated'),
                (2, v.iloc[0:-100], 'Pages w/ Fewest Versions Rated')] :
    section.hist(ax=axes[i], 
           normed=True,
           bins=min(math.sqrt(section.count()), len(section.unique())),
           color='lightskyblue',
           alpha=0.7)
    section.plot(ax=axes[i],
                 kind='kde',
                 xlim=(section.min(), section.max()),
                 title="%s (N=%d)" % (title, section.count()))
    axes[i].set_xlabel("Number of Rated Versions for Page")

fig.suptitle("Distribution of Rated Versions per Page (Sample of 10,000 Pages)")
#fig.tight_layout()
Out[12]:
<matplotlib.text.Text at 0x1077b3950>

Again, we have a Zipf's law-type distribution with a long right tail.
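
If one wanted to eyeball the Zipf-like claim, a quick check is a rank-frequency plot on log-log axes (a sketch of mine, not run in the original notebook; an approximately straight line is consistent with a heavy, power-law-like tail):

counts = np.sort(v.values)[::-1]  # rated-version counts per page, largest first
plt.figure(figsize=(6, 4))
plt.loglog(np.arange(1, len(counts) + 1), counts, marker='.', linestyle='none')
plt.xlabel('Page rank')
plt.ylabel('Number of rated versions')
plt.title('Rank-frequency plot of rated versions per page')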


1c - Ratings Per Version

Ratings per version gives us a measure of how much information we have about each version, i.e. how reliable the per-version rating averages are.

In [13]:
rpv = r / v
rpv.describe()
Out[13]:
count    10000.000000
mean         1.626829
std          1.403800
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         40.119048
dtype: float64
In [14]:
rpv.sort(ascending=False)
rpv.head(10)
Out[14]:
page_title
List_of_spells_in_Harry_Potter         40.119048
Syn_and_anti_addition                  30.000000
Tool_pusher                            25.000000
Alan_Questel                           23.200000
MS_Contin                              22.500000
Group_technology                       22.000000
Arc_elasticity                         21.000000
The_Other_Side_of_the_Sky:_A_Memoir    20.000000
Cultural_lag                           17.000000
Cultural_institutions                  17.000000
dtype: float64

This is interesting! Not what you'd expect. RPV can be conceived as the viewing velocity divided by the editing velocity, i.e. roughly the number of views per edit (there are a lot of assumptions underlying that conception, of course, chiefly that ratings are proportional to views). For example, The_Hunger_Games spreads its 2,713 ratings over 580 rated versions (an RPV of about 4.7), while List_of_spells_in_Harry_Potter packs 1,685 ratings into just 42 rated versions (an RPV of about 40).

In [15]:
fig = plt.figure()
rpv.sort()
ax = rpv.tail(20).plot(
    kind='barh',
    title='Pages with Highest Ratings-Per-Version-Rated',
    figsize=(10, 8))
ax.set_xlabel('Ratings per version')
Out[15]:
<matplotlib.text.Text at 0x1088c7450>
In [16]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

for i, section, title in [(0, rpv, 'All Pages'), 
                (1, rpv.iloc[-100:], 'Pages w/ Highest RPV'),
                (2, rpv.iloc[0:-100], 'Pages w/ Lowest RPV')] :
    section.hist(ax=axes[i], 
           normed=True,
           bins=math.sqrt(section.count()),
           color='lightskyblue',
           alpha=0.7)
    section.plot(ax=axes[i],
                 kind='kde',
                 xlim=(section.min(), section.max()),
                 title="%s (N=%d)" % (title, section.count()))
    axes[i].set_xlabel("Number of Ratings per Version Rated for Page")

fig.suptitle("Distribution of Ratings-Per-Version-Rated per Page (Sample of 10,000 Pages)")
#fig.tight_layout()
Out[16]:
<matplotlib.text.Text at 0x10886d950>

Note that, because RPV is a quotient of two small integer counts, the distribution has bumps at simple ratios p/q (1, 3/2, 2, and so on).
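
To make those spikes visible, one could simply tabulate the most common RPV values (a one-line sketch; output not shown here):

rpv.value_counts().head(10)  # expect large counts at simple ratios such as 1.0, 2.0, 1.5, 3.0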


2. Page Ratings

2a - What is the distribution of page ratings?

Let's look at the distribution of all ratings. First we'll look at rating_all_mean (the mean of all dimensions, computed only when all four dimensions are present) and rating_any_mean (the mean of whichever dimensions are present). We'll also look at the number of dimensions rated.

In [17]:
#bins = [1,1.5,2,2.5,3,3.5,4,4.5,5]
#bins = [0.75,1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.25, 4.75, 5.25]
bins=8
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
# Means
for i, var in enumerate(['rating_all_mean', 'rating_any_mean']):
    df[var].dropna().hist(ax=axes[i],
                            normed=True,
                            bins=bins,
                            color='lightskyblue',
                            alpha=0.7)
    df[var].dropna().plot(ax=axes[i],
                            kind='kde',
                            xlim=(1,5),
                            ylim=(0,1),
                            title=var)

# Number of dimensions per rating
dim_counts = df.groupby(['rating_dim_count']).page_id.count() / df.page_id.count()
dim_counts
#if True:
dim_counts.plot(ax=axes[2],
                    kind='bar',
                    color='lightskyblue',
                    alpha=0.7,
                    rot=0,
                    title="No. of Dimensions Rated")

axes[0].set_title("Ratings w/ All 4 Dim. Rated")
axes[0].set_xlabel("Mean of All Rated Dimensions")
axes[1].set_title("Ratings w/ Any Dim. Rated")
axes[1].set_xlabel("Mean of All Rated Dimensions")
axes[2].set_xlabel("Number of Dimensions Rated")
axes[2].set_ylabel("Proportion of All Ratings")
fig.suptitle("Distribution of Rating Values and Number of Dimensions Rated (Sample of 10,000 Pages)", va='bottom')
Out[17]:
<matplotlib.text.Text at 0x1088d5410>

So, most people rate all 4 dimensions, and most articles are rated as 5 across all dimensions. There is also a significant minority who give all 1s.
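
Both claims can be quantified directly; a quick sketch (these shares are my check, not computed elsewhere in the notebook):

all4 = df[df.rating_dim_count == 4]                  # ratings where every dimension was filled in
print "all 5s:", (all4.rating_all_mean == 5).mean()  # share of complete ratings that are 5 on every dimension
print "all 1s:", (all4.rating_all_mean == 1).mean()  # share of complete ratings that are 1 on every dimension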


2b - What does it mean if a user does not rate all dimensions?

Why would a user only rate one (or two, or three) dimensions? Perhaps they feel they are not qualified to judge one or more aspects of an article. Or, perhaps they only care about particular dimensions; maybe, for example, a user doesn't believe "trustworthiness" is discernible from "objectivity", and thus they only rate one of those. Does the distribution of ratings change based on how many dimensions are rated? For example, perhaps people rate one dimension when they find something particularly egregious, and so they tend to give poor ratings.

Let's look at the distribution of rating_any_mean by number of rated dimensions.

In [18]:
#bins = [1,1.5,2,2.5,3,3.5,4,4.5,5,5.5]
#bins = [0.75,1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.25, 4.75, 5.25]
bins=8
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(16, 4))

for i, d in enumerate([1,2,3,4]):
    axes[i].hist(df[df.rating_dim_count==d].rating_any_mean.values, normed=False, bins=bins, align='mid')
    axes[i].set_title("Dimensions Rated = %d" % d)

fig.suptitle('Distribution of rating_any_mean',
             size='large', weight='bold', va='bottom')

# Output description
df.groupby('rating_dim_count').rating_any_mean.describe().unstack()
Out[18]:
                  count      mean       std  min       25%  50%       75%  max
rating_dim_count
1                 19535  3.841157  1.498986    1  3.000000    5  5.000000    5
2                  2124  3.436441  1.478599    1  2.000000    4  5.000000    5
3                  2775  3.531291  1.318520    1  2.666667    4  4.666667    5
4                 53303  3.658185  1.343684    1  2.750000    4  5.000000    5

4 rows × 8 columns

So it appears that users who rate only one dimension tend to rate that dimension more highly. Let's run pairwise t-tests to check that the differences are significant:

In [19]:
ttest_group(df, by='rating_dim_count', var='rating_any_mean', equal_var=False)
Out[19]:
rating_any_mean   t stat.                       p-value
rating_dim_count        1      2      3      4        1      2      3      4
1                       x  11.96  11.38  14.99        x  0.000  0.000  0.000
2                  -11.96      x  -2.33  -6.80    0.000      x  0.020  0.000
3                  -11.38   2.33      x  -4.94    0.000  0.020      x  0.000
4                  -14.99   6.80   4.94      x    0.000  0.000  0.000      x

4 rows × 8 columns


2c - What are the distributions of each dimension?

Now let's look at each particular dimension. Are users more forgiving on some dimensions vs. others?

In [20]:
dims = ['comp', 'obj', 'trust', 'writ']
nonmiss = df.loc[:,dims].count() / len(df)
nonmiss.name = 'pct not missing'
d = desc(df.loc[:,dims], percentiles=range(10,100,10))
pd.concat([pd.DataFrame(nonmiss), d], axis=1)
Out[20]:
       pct not missing  count   mean    std    min    10%    20%    30%    40%    50%    60%    70%    80%    90%    max
comp          0.767447  60263  3.400  1.547  1.000  1.000  2.000  3.000  3.000  4.000  4.000  5.000  5.000  5.000  5.000
obj           0.732209  57496  3.713  1.490  1.000  1.000  2.000  3.000  4.000  4.000  5.000  5.000  5.000  5.000  5.000
trust         0.789478  61993  3.719  1.516  1.000  1.000  2.000  3.000  4.000  4.000  5.000  5.000  5.000  5.000  5.000
writ          0.835006  65568  3.813  1.413  1.000  1.000  2.000  3.000  4.000  4.000  5.000  5.000  5.000  5.000  5.000

4 rows × 15 columns

So it appears that "completeness" tends to be rated lower than other dimensions, while "well-written" tends to be rated higher. Also, fewer users rate "objectivity", while more users rate "well-written".

To look at whether these dimensions can be deemed to be different, let's use the Wilcoxon signed-rank test (a non-parametric alternative to the paired t-test, appropriate since we know the data is not normally distributed):

In [21]:
ttest_group(df, var=['comp','obj','trust','writ'], test='wilcoxon')
Out[21]:
            t stat.                                                   p-value
               comp            obj          trust           writ         comp    obj  trust   writ
comp              x    71145895.00    68851643.00    63112671.00            x  0.000  0.000  0.000
obj     71145895.00              x    82582569.00   111988267.50        0.000      x  0.000  0.153
trust   68851643.00    82582569.00              x   104132147.00        0.000  0.000      x  0.000
writ    63112671.00   111988267.50   104132147.00              x        0.000  0.153  0.000      x

4 rows × 8 columns

With a p-value of 0.153, we cannot reject the null hypothesis that the "Objective" and "Well-written" ratings are drawn from the same distribution; that said, failing to reject is not itself evidence that the two dimensions are in fact measuring the same thing.

Let's look at histograms of ratings for each dimension.

In [22]:
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(12, 4), sharey=False)

for i, d in enumerate(dims):
    gs = df.groupby([d])
    rtgs = df[d].dropna().count()
    x = gs[d].count() / rtgs
    x.plot(ax = axes[i], kind='bar', ylim=(0,.5), rot=0)
    axes[i].set_title(d)
    axes[i].set_xlabel("Rating Value")

axes[0].set_ylabel("Proportion of Ratings")
axes[0].set_title("Complete")
axes[1].set_title("Objective")
axes[2].set_title("Trustworthy")
axes[3].set_title("Well-written")

fig.suptitle('Distribution of Each Rating Dimension', va='bottom')
Out[22]:
<matplotlib.text.Text at 0x1028ec850>

2d - Are the dimensions correlated?

We already know that a lot of users give all 5s. But perhaps some dimensions are more correlated with each other than with the rest?

In [23]:
for method in ['pearson', 'spearman']:
    print "%s Correlation" % method
    print df[dims].corr(method)
pearson Correlation
           comp       obj     trust      writ
comp   1.000000  0.715450  0.749500  0.764108
obj    0.715450  1.000000  0.774930  0.738221
trust  0.749500  0.774930  1.000000  0.753312
writ   0.764108  0.738221  0.753312  1.000000

[4 rows x 4 columns]
spearman Correlation
           comp       obj     trust      writ
comp   1.000000  0.704479  0.735459  0.759818
obj    0.704479  1.000000  0.751640  0.703651
trust  0.735459  0.751640  1.000000  0.725299
writ   0.759818  0.703651  0.725299  1.000000

[4 rows x 4 columns]

As we might expect, "objective" is correlated most highly with "trustworthy". Perhaps less expected, "complete" and "well-written" are also more correlated with each other than they are with the other dimensions. Both findings hold whether we use the ordinary Pearson correlation or Spearman's rank correlation coefficient. Still, all dimensions are highly (and positively) correlated with one another.


2e - How consistent are ratings for particular pages/versions?

Ideally, ratings would cluster pretty closely for a particular page, and certainly for a particular version. If ratings are truly meaningful to the community as a whole (vs. to a certain individual or a small group), then users should be able to come to some sort of consensus about the quality of a particular page. We will explore this issue more deeply in section 3 (Page Rating Histories), but here we can look at some general statistics.

We'll start by looking at the distribution of ratings around the mean rating. First, I'll focus on ratings that have all four dimensions, since I suspect that such ratings will be of higher quality (on the assumption that someone rating all four dimensions is more likely to actually consider all dimensions -- though of course it is also possible that people who rate the same across the board are actually bringing less consideration to the process).

We will also restrict the sample to pages (and versions) with at least a minimum number of complete ratings, starting at 2 and then raising the cutoff (below, 93 and 313 ratings for pages, and 4 and 6 for versions).

Group ratings by page and by version (revision), and calculate the mean and median of rating_all_mean, as well as the count of ratings with all dimensions rated.

NB: rating_all_mean is the mean across all rating dimensions within a single rating, while page_all_mean (or page_comp_mean, etc.) is the mean of rating_all_mean (or comp, etc.) across all observations for each page.

In [24]:
p_ratings = df.groupby('page_title')
df['page_all_mean'] = p_ratings.rating_all_mean.transform('mean') # the per-page mean of `rating_all_mean`
df['page_all_median'] = p_ratings.rating_all_mean.transform('median') # the per-page median of `rating_all_mean`
df['page_all_rate_count'] = p_ratings.rating_all_mean.transform('count') # the per-page count of non-null `rating_all_mean`
                                                                        # i.e. the count of rows with all 4 dimensions
for dim in dims: # means for each rating dimension (comp/trust/obj/writ)
    df['page_%s_mean' % dim] = p_ratings[dim].transform('mean')
    #df['page_%s_median' % dim] = p_ratings[dim].transform('median')
In [25]:
# Do same as above, but this time grouping by VERSION (aka revision)
v_ratings = df.groupby('rev_id')
df['rev_all_mean'] = v_ratings.rating_all_mean.transform('mean')
df['rev_all_median'] = v_ratings.rating_all_mean.transform('median')
df['rev_all_rate_count'] = v_ratings.rating_all_mean.transform('count')
for dim in dims:
    df['rev_%s_mean' % dim] = v_ratings[dim].transform('mean')
    #df['rev_%s_median' % dim] = v_ratings[dim].transform('median')

For each page and version, compute difference variables for:

  • Difference between a particular rating value (either rating_all_mean or a single rating dimension value) and
    • the mean for all observations for that page/version (with all 4 rating dimensions)
    • the median for all observations for that page/version (with all 4 rating dimensions)
  • And, the absolute value of those difference variables (so we can compute the average absolute deviation, etc.)
In [26]:
for unit in ['page', 'rev']:
    for avg in ['mean', 'median']:
        df['diff_%s_all_%s' % (unit, avg)] = df.rating_all_mean - df['%s_all_%s' % (unit, avg)]
        df['absdif_%s_all_%s' % (unit, avg)] = abs(df.rating_all_mean - df['%s_all_%s' % (unit, avg)])
        if avg == 'mean':
            for dim in dims:
                df['diff_%s_%s_%s' % (unit, dim, avg)] = df[dim] - df['%s_%s_%s' % (unit, dim, avg)]
                df['absdif_%s_%s_%s' % (unit, dim, avg)] = abs(df[dim] - df['%s_%s_%s' % (unit, dim, avg)])

Let's look first at the mean absolute deviations (MADs) for each page/version, using both mean and median as our measure of central tendency.

In [27]:
desc(df[df.page_all_rate_count >= 2].filter(regex='absdif_page.*')).sort()
Out[27]:
                        count   mean    std    min    25%    50%    75%    max
absdif_page_all_mean    49712  0.956  0.722  0.000  0.395  0.820  1.333  3.640
absdif_page_all_median  49712  0.905  0.881  0.000  0.250  0.625  1.250  4.000
absdif_page_comp_mean   55500  1.126  0.759  0.000  0.500  1.059  1.624  3.667
absdif_page_obj_mean    53265  1.103  0.780  0.000  0.500  1.000  1.500  3.809
absdif_page_trust_mean  56986  1.117  0.787  0.000  0.500  0.981  1.509  3.833
absdif_page_writ_mean   60508  1.040  0.762  0.000  0.500  0.893  1.418  3.820

6 rows × 8 columns

In [28]:
desc(df[df.rev_all_rate_count >= 2].filter(regex='absdif_rev.*')).sort()
Out[28]:
                       count   mean    std    min    25%    50%    75%    max
absdif_rev_all_mean    34515  0.817  0.671  0.000  0.275  0.649  1.200  3.784
absdif_rev_all_median  34515  0.758  0.829  0.000  0.125  0.500  1.125  4.000
absdif_rev_comp_mean   37403  0.960  0.739  0.000  0.400  0.826  1.500  3.682
absdif_rev_obj_mean    36317  0.933  0.755  0.000  0.333  0.750  1.400  3.875
absdif_rev_trust_mean  38216  0.958  0.766  0.000  0.333  0.800  1.484  3.818
absdif_rev_writ_mean   39920  0.902  0.735  0.000  0.333  0.722  1.333  3.818

6 rows × 8 columns

So, the spread of ratings within each version is smaller than within each page, which makes sense. Still, the spread is relatively large: each dimension sits, on average, almost a whole point away from its mean. Where a version has exactly 2 ratings, this means the two ratings are, on average, separated by nearly 2 whole points, since their mean is their midpoint: a 5 and a 3, say, or a 4 and a 2.

What happens when we change the cutoff?

First, let's look at the distribution of the counts of pages and versions with all dimensions rated.

In [29]:
desc(df.filter(like='all_rate_count'), float_decimals=1, percentiles=[5,10,25,50,75,90,95]).sort().T
Out[29]:
       page_all_rate_count  rev_all_rate_count
count                78524               78524
mean                 172.5                 5.4
std                  437.4                12.7
min                    0.0                 0.0
5%                     1.0                 0.0
10%                    2.0                 0.0
25%                    6.0                 1.0
50%                   22.0                 2.0
75%                   93.0                 5.0
90%                  313.0                11.0
95%                 1304.0                22.0
max                 2110.0               141.0

12 rows × 2 columns

In [30]:
desc(pd.DataFrame(df[df.rev_all_rate_count >= 2].groupby('rev_id').rev_all_rate_count.first()), float_decimals=1, percentiles=[5,10,25,50,75,90,95]).sort().T
Out[30]:
       rev_all_rate_count
count                9544
mean                  3.6
std                   4.4
min                   2.0
5%                    2.0
10%                   2.0
25%                   2.0
50%                   2.0
75%                   4.0
90%                   6.0
95%                   8.0
max                 141.0

12 rows × 1 columns

Now, I'll plot the mean absolute deviations across cutoffs.

In [31]:
from matplotlib.ticker import FuncFormatter

lines = [] # list of SERIES of points, one for each of the lines illustrating change in AAD/MAD; 
n = [] # series of counts of pages satisfying cutoff
count = len(df) # total obs. count
cutoffs = range(2,252)

# For each cutoff, create an entry in the lines list and the N list, the first corresponding
# to a Series with the means for each of the variables when the sample is restricted to that
# cutoff.
for cutoff in cutoffs:
    # Add a series to the lines list
    lines.append(df[df.page_all_rate_count >= cutoff].filter(regex='absdif_page.*').mean())
    # Add the percentage of the sample cover to the n list
    n.append(100 * len(df[df.page_all_rate_count >= cutoff])/float(len(df)))

mad = pd.DataFrame(lines, index=cutoffs) # concat all the Series together, indexing by the cutoffs list
n_line = pd.Series(n, name='n', index=cutoffs) # combine all the points in the list to a Series
In [32]:
fig, ax = plt.subplots(figsize=(10,6))
ax1 = mad.plot(ax=ax, linewidth=2, sort_columns=True,
         title="Change in Mean Absolute Deviation based on Minimum Number of Ratings per Page",
         style=['k:','c:','r-','g-','m-','b-'],
         alpha=.8)
ax1.yaxis.set_label_text('Mean Absolute Deviation')
ax1.xaxis.set_label_text('Minimum Number of Ratings per Page')
ax1.legend(['MAD from Page Mean -\n Mean of all rating dimensions', 
            'MAD from Page Median -\n Mean of all rating dimensions',
            'MAD from Page Mean -\n "Complete"',
            'MAD from Page Mean -\n "Objective"',
            'MAD from Page Mean -\n "Trustworthy"',
            'MAD from Page Mean -\n "Well-Written"'],
           loc="upper right",
           bbox_to_anchor=(1.55,1))

ax2 = n_line.plot(ax=ax, secondary_y=True, label="% of Ratings in Sample (right axis)", alpha=.5, style='k--', )
ax2.yaxis.set_label_text('% of Ratings Satisfying Cutoff', rotation=-90)
ax2.yaxis.labelpad = 12
def pct_fmt(x, pos=0): 
    return '%d%%'%(x)
ax2.yaxis.set_major_formatter(FuncFormatter(pct_fmt))
ax2.legend(loc='lower left')
Out[32]:
<matplotlib.legend.Legend at 0x10d164f10>
In [33]:
l = []
n = []
count = len(df)
cutoffs = range(2,52)
for cutoff in cutoffs:
    l.append(df[df.rev_all_rate_count >= cutoff].filter(regex='absdif_rev.*').mean())
    n.append(100 * len(df[df.rev_all_rate_count >= cutoff])/float(len(df)))

aad = pd.DataFrame(l, index=cutoffs)
n_line = pd.Series(n, name='n', index=cutoffs)

fig, ax = plt.subplots(figsize=(10,8))
ax1 = aad.plot(ax=ax, linewidth=2, sort_columns=True,
         title="Change in Average Absolute Deviation based on Minimum Number of Ratings per Version")
ax1.yaxis.set_label_text('Average Absolute Deviation')
ax1.xaxis.set_label_text('Cutoff')
ax1.legend(loc="upper right")

ax2 = n_line.plot(ax=ax, secondary_y=True, label="N (right axis)", alpha=.5, style='k--', )
ax2.yaxis.set_label_text('% of Ratings Satisfying Cutoff', rotation=-90)
ax2.yaxis.labelpad = 16
def pct_fmt(x, pos=0): 
    return '%d%%'%(x)
ax2.yaxis.set_major_formatter(FuncFormatter(pct_fmt))
ax2.legend(loc='lower left')
Out[33]:
<matplotlib.legend.Legend at 0x10d1a6350>

It appears that restricting our sample to pages/versions with more observations initially increases, then steadily decreases, the spread of values around the mean/median. (More on what that might mean below.)

Here are the same statistics as presented in the above tables, only now for the non-absolute differences:

In [34]:
desc(df[df.page_all_rate_count >= 2].filter(like='diff_page')).sort()
Out[34]:
                      count    mean    std     min     25%    50%    75%    max
diff_page_all_mean    49712   0.000  1.198  -3.640  -0.667  0.226  0.917  3.357
diff_page_all_median  49712  -0.256  1.237  -4.000  -0.750  0.000  0.500  4.000
diff_page_comp_mean   55500   0.000  1.358  -3.667  -0.938  0.200  1.086  3.556
diff_page_obj_mean    53265  -0.000  1.351  -3.809  -0.800  0.333  1.023  3.643
diff_page_trust_mean  56986   0.000  1.367  -3.833  -0.867  0.335  1.000  3.333
diff_page_writ_mean   60508   0.000  1.289  -3.820  -0.828  0.265  0.900  3.333

6 rows × 8 columns

In [35]:
desc(df[df.rev_all_rate_count >= 2].filter(like='diff_rev')).sort()
Out[35]:
                     count    mean    std     min     25%    50%    75%    max
diff_rev_all_mean    34515   0.000  1.057  -3.784  -0.600  0.083  0.694  3.543
diff_rev_all_median  34515  -0.139  1.115  -4.000  -0.625  0.000  0.500  4.000
diff_rev_comp_mean   37403   0.000  1.212  -3.682  -0.750  0.000  0.889  3.500
diff_rev_obj_mean    36317  -0.000  1.200  -3.818  -0.667  0.000  0.833  3.875
diff_rev_trust_mean  38216  -0.000  1.227  -3.818  -0.667  0.000  0.833  3.692
diff_rev_writ_mean   39920  -0.000  1.163  -3.818  -0.667  0.000  0.750  3.333

6 rows × 8 columns

Note that the mean column here is not meaningful (except perhaps for the diff_*_all_median rows), since the mean of the difference from the mean is, of course, 0. Of greater interest are the quartiles: for the mean-based differences the median deviation is positive while the lower tail stretches much further down, i.e. the distributions are negatively (left) skewed, with most ratings slightly above the page/version mean and a smaller number far below it. The median-based differences are the exception, since their median is zero by construction.
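
The direction of the skew can also be checked numerically with the sample skewness of each difference column (a quick sketch of mine; negative values indicate a longer left tail):

print df[df.page_all_rate_count >= 2].filter(like='diff_page').skew()
print df[df.rev_all_rate_count >= 2].filter(like='diff_rev').skew()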

Let's explore these distributions further with some histograms at various cutoffs.

In [36]:
cols = df.filter(like='diff_page').columns.values # list of columns starting with 'diff_page'
cols.sort()

for cutoff in [2, 93, 313]:
    # Restrict data to:
    # - columns of interest
    # - rows that have all rating dimensions
    # - only for pages with number of ratings above cutoff
    d = df.loc[(df.rating_all_mean.notnull()) & (df.page_all_rate_count >= cutoff), cols]
    
    # Create subplot for each col variable
    fig, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(4*len(cols), 5), sharey=True)

    # Loop through the columns, plot a histogram and also a kernel density plot on top
    for i, var in enumerate(cols):
        d[var].dropna().hist(ax=axes[i],
                                normed=True,
                                bins=math.log(d[var].count()),
                                color='lightskyblue',
                                alpha=0.7)
        d[var].dropna().plot(ax=axes[i],
                                kind='kde',
                                linewidth=2,
                                xlim=(-5,5),
                                ylim=(0,1),
                                title=var)
    fig.suptitle("Distribution of rating deviation from page-wide mean/median - At least %d ratings with all dimensions" % cutoff, 
                 size='large', weight='bold', va='bottom')

So, all of these plots are skewed to the left (a long lower tail): most people rate slightly higher than the page mean, while a smaller group rates significantly lower. This makes sense given that the mean rating is fairly high and the rating scale is capped at 5.

The more meaningful factor is the spread, giving an indication of how consistent the ratings are. There is a consistent bimodal pattern here, which could indicate that ratings are not, in fact, centered on a single mean. There may be, say, two types of people rating articles, with each type having significantly different evaluation metrics and models.

Another aspect to note is that pages with greater numbers of ratings do not seem to lose this bimodality; if anything, the bimodality is strengthened, indicating that it is not a problem of low N, but rather a consistent pattern. In fact, it is even possible that articles with many ratings are more controversial, meaning a greater variance (indicated in this case by bimodality).

Looking at versions (using a different set of cutoffs), we see similar patterns:

In [37]:
cols = df.filter(like='diff_rev').columns.values
cols.sort()
cols
Out[37]:
array(['diff_rev_all_mean', 'diff_rev_all_median', 'diff_rev_comp_mean',
       'diff_rev_obj_mean', 'diff_rev_trust_mean', 'diff_rev_writ_mean'], dtype=object)
In [38]:
titles = {'diff_rev_all_mean': 'Deviation from Version Mean \n Mean of All Dim.',
          'diff_rev_all_median': 'Deviation from Version Median \n Mean of All Dim.',
          'diff_rev_comp_mean': 'Deviation from Version Mean \n "Complete"',
          'diff_rev_obj_mean': 'Deviation from Version Mean \n "Objective"',
          'diff_rev_trust_mean': 'Deviation from Version Mean \n "Trustworthy"',
          'diff_rev_writ_mean': 'Deviation from Version Mean \n "Well-Written"'}

for cutoff in [2, 4, 6]:
    d = df.loc[(df.rating_all_mean.notnull()) & (df.rev_all_rate_count >= cutoff), cols]
    fig, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(4*len(cols), 5), sharey=False)

    for i, var in enumerate(cols):
        d[var].dropna().hist(ax=axes[i],
                                normed=True,
                                bins=math.log(d[var].count()),
                                color='lightskyblue',
                                alpha=0.7)
        d[var].dropna().plot(ax=axes[i],
                                kind='kde',
                                linewidth=2,
                                xlim=(-5,5),
                                ylim=(0,1))
        axes[i].set_title(titles[var], size='x-large')
        axes[i].set_ylabel("", size='x-large')
        axes[i].tick_params(axis='both', which='major', labelsize='x-large')
    axes[0].set_ylabel("Density")
    fig.suptitle("Distribution of rating deviation from version-wide mean/median - "
                 "At least %d ratings with all dimensions per version" % cutoff, 
                 size='xx-large', weight='bold', va='bottom', y=1)

2f - Do users who are logged in rate pages differently?

If we assume that logged-in users are more savvy about Wikipedia standards and the stated goals of each page, then it might be true that such logged-in users would have more "accurate" (in the sense of reflecting the a priori standards of Wikipedia) ratings. They might also be expected to have ratings that are more closely linked to actual knowledge about the topics.

Let's plot the differences in the distributions of ratings based on whether the user is logged in or not.

In [39]:
titles = {'rating_all_mean': 'Mean,\n All Dim. Rated',
          'rating_any_mean': 'Mean,\n Any Dim. Rated',
          'comp': '"Complete"',
          'obj': '"Objective"',
          'trust': '"Trustworthy"',
          'writ': '"Well-Written"'}

#bins = [1,1.5,2,2.5,3,3.5,4,4.5,5,5.5]
#bins = [0.75,1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.25, 4.75, 5.25]
bins=8
pcols = ['rating_all_mean', 'rating_any_mean'] # plot columns
dims = ['comp', 'obj', 'trust', 'writ'] # dimensions of rating
pcols.extend(dims)

fig, axes = plt.subplots(nrows=2, ncols=len(pcols), figsize=(15, 6), sharex=False, sharey=False)

login_gs = df.groupby('logged_in')

# For each row (key, data) in the logged_in-groups
for r, (k, d) in enumerate(login_gs.groups.items()):
    
    # for each column, variable
    for c, var in enumerate(pcols):
        
        # For continuous variables (with >10 unique values), make histogram
        if len(df.loc[d, var].unique()) > 10:
            df.loc[d, var].hist(ax=axes[r, c], normed=True, bins=bins, align='mid')
            if r == 0:
                axes[r,c].set_title(titles[var], size='large')
                
        # For discrete variables, just plot frequency of each option
        else:
            gs = df.loc[d].groupby(var) # group by the variable
            rtgs = df.loc[d, var].dropna().count()
            x = gs[var].count() / rtgs
            x.plot(ax = axes[r, c], kind='bar', ylim=(0,.5), rot=0)
            if r == 0:
                axes[r, c].set_title(titles[var], size='x-large')
            axes[r, c].set_xlabel("")
                
        # If first column, set y-label to logged in value
        if c == 0:
            axes[r, c].set_ylabel("Logged-In = %d" % k)

fig.suptitle('Proportion of Each Rating, Based on Logged-in Status', size='x-large', va='bottom')
fig.tight_layout()

So, logged-in users appear to be less likely to give ratings of 1.
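
To put a rough number on that impression, here is a quick sketch computing the share of 1-ratings on each dimension by logged-in status (my check, not run above):

print df.groupby('logged_in')[dims].apply(lambda g: (g == 1).sum() / g.count().astype(float))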

In [40]:
print "Logged in"
print desc(df[(df.page_all_rate_count >= 2) & (df.logged_in==1)].filter(regex='absdif_page.*')).sort()
print "Not logged in"
print desc(df[(df.page_all_rate_count >= 2) & (df.logged_in==0)].filter(regex='absdif_page.*')).sort()
Logged in
                        count   mean    std    min    25%    50%    75%    max
absdif_page_all_mean     1351  0.786  0.649  0.000  0.257  0.667  1.150  3.567
absdif_page_all_median   1351  0.710  0.782  0.000  0.125  0.500  1.000  4.000
absdif_page_comp_mean    1443  0.931  0.716  0.000  0.333  0.867  1.369  3.537
absdif_page_obj_mean     1408  0.920  0.723  0.000  0.351  0.807  1.275  3.439
absdif_page_trust_mean   1459  0.919  0.743  0.000  0.333  0.800  1.266  3.780
absdif_page_writ_mean    1508  0.840  0.693  0.000  0.333  0.726  1.143  3.667

[6 rows x 8 columns]
Not logged in
                        count   mean    std    min    25%    50%    75%    max
absdif_page_all_mean    48361  0.961  0.724  0.000  0.400  0.829  1.339  3.640
absdif_page_all_median  48361  0.910  0.883  0.000  0.250  0.625  1.250  4.000
absdif_page_comp_mean   54057  1.131  0.759  0.000  0.523  1.067  1.634  3.667
absdif_page_obj_mean    51857  1.108  0.781  0.000  0.500  1.000  1.510  3.809
absdif_page_trust_mean  55527  1.123  0.788  0.000  0.502  0.993  1.524  3.833
absdif_page_writ_mean   59000  1.045  0.763  0.000  0.500  0.893  1.429  3.820

[6 rows x 8 columns]

The average absolute difference for ratings across pages is lower for users that are logged in.

In [41]:
print "Logged in"
print desc(df[(df.rev_all_rate_count >= 2) & (df.logged_in==1)].filter(regex='absdif_rev.*')).sort()
print "Not logged in"
print desc(df[(df.rev_all_rate_count >= 2) & (df.logged_in==0)].filter(regex='absdif_rev.*')).sort()
Logged in
                       count   mean    std    min    25%    50%    75%    max
absdif_rev_all_mean      629  0.688  0.619  0.000  0.212  0.500  1.000  3.391
absdif_rev_all_median    629  0.648  0.729  0.000  0.000  0.500  1.000  4.000
absdif_rev_comp_mean     667  0.809  0.719  0.000  0.250  0.600  1.212  3.391
absdif_rev_obj_mean      649  0.806  0.718  0.000  0.250  0.600  1.333  3.357
absdif_rev_trust_mean    661  0.783  0.722  0.000  0.125  0.500  1.250  3.696
absdif_rev_writ_mean     699  0.763  0.679  0.000  0.250  0.600  1.000  3.346

[6 rows x 8 columns]
Not logged in
                       count   mean    std    min    25%    50%    75%    max
absdif_rev_all_mean    33886  0.819  0.672  0.000  0.281  0.650  1.208  3.784
absdif_rev_all_median  33886  0.761  0.831  0.000  0.125  0.500  1.125  4.000
absdif_rev_comp_mean   36736  0.963  0.739  0.000  0.400  0.833  1.500  3.682
absdif_rev_obj_mean    35668  0.935  0.756  0.000  0.333  0.750  1.400  3.875
absdif_rev_trust_mean  37555  0.961  0.767  0.000  0.333  0.800  1.500  3.818
absdif_rev_writ_mean   39221  0.904  0.736  0.000  0.333  0.727  1.333  3.818

[6 rows x 8 columns]

The average absolute difference for ratings across versions is also lower for users who are logged in. Let's run some t-tests to see whether these differences in means are significant.

In [42]:
cols = df.filter(like="absdif").columns.values
cols.sort()
for var in cols:
    print ttest_group(df.loc[(df.rev_all_rate_count >= 2), [var, 'logged_in']].dropna(), 
                by='logged_in', var=var)
absdif_page_all_mean t stat.       p-value       
logged_in                  0     1       0      1
0                          x  5.32       x  0.000
1                      -5.32     x   0.000      x

[2 rows x 4 columns]
absdif_page_all_median t stat.       p-value       
logged_in                    0     1       0      1
0                            x  4.74       x  0.000
1                        -4.74     x   0.000      x

[2 rows x 4 columns]
absdif_page_comp_mean t stat.       p-value       
logged_in                   0     1       0      1
0                           x  6.14       x  0.000
1                       -6.14     x   0.000      x

[2 rows x 4 columns]
absdif_page_obj_mean t stat.       p-value       
logged_in                  0     1       0      1
0                          x  4.68       x  0.000
1                      -4.68     x   0.000      x

[2 rows x 4 columns]
absdif_page_trust_mean t stat.       p-value       
logged_in                    0     1       0      1
0                            x  6.01       x  0.000
1                        -6.01     x   0.000      x

[2 rows x 4 columns]
absdif_page_writ_mean t stat.       p-value       
logged_in                   0     1       0      1
0                           x  6.09       x  0.000
1                       -6.09     x   0.000      x

[2 rows x 4 columns]
absdif_rev_all_mean t stat.       p-value       
logged_in                 0     1       0      1
0                         x  5.27       x  0.000
1                     -5.27     x   0.000      x

[2 rows x 4 columns]
absdif_rev_all_median t stat.       p-value       
logged_in                   0     1       0      1
0                           x  3.82       x  0.000
1                       -3.82     x   0.000      x

[2 rows x 4 columns]
absdif_rev_comp_mean t stat.       p-value       
logged_in                  0     1       0      1
0                          x  5.47       x  0.000
1                      -5.47     x   0.000      x

[2 rows x 4 columns]
absdif_rev_obj_mean t stat.       p-value       
logged_in                 0     1       0      1
0                         x  4.54       x  0.000
1                     -4.54     x   0.000      x

[2 rows x 4 columns]
absdif_rev_trust_mean t stat.       p-value       
logged_in                   0     1       0      1
0                           x  6.26       x  0.000
1                       -6.26     x   0.000      x

[2 rows x 4 columns]
absdif_rev_writ_mean t stat.       p-value       
logged_in                  0     1       0      1
0                          x  5.44       x  0.000
1                      -5.44     x   0.000      x

[2 rows x 4 columns]

So, the data appears to support the notion that logged-in users produce more reliable (or, at least, consistent) ratings.

3. Page Rating History

Before we can look at the page rating history, we need a notion of a "version" that is consistent across pages. The most sensible solution at this point (since we haven't merged in actual revision data from the Wiki DB) is to construct an nth_version index: the ordinal position of each rated version within its page. Each rating can then be seen as a piece of information about a particular version, with the versions themselves being the true "observations".

First we merge in the RPV values for each rating.

In [43]:
df = pd.merge(left=df, right=pd.DataFrame(rpv, columns=['rpv']), left_on='page_title', right_index=True)
df = df.drop('Unnamed: 0', 1)
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 78524 entries, 0 to 78523
Data columns (total 54 columns):
rev_id                    78524 non-null float64
nth_rating                78524 non-null int64
page_id                   78524 non-null int64
page_title                78524 non-null object
logged_in                 78524 non-null float64
datetime                  78524 non-null object
comp                      60263 non-null float64
obj                       57496 non-null float64
trust                     61993 non-null float64
writ                      65568 non-null float64
rating_dim_count          77737 non-null float64
rating_any_mean           77737 non-null float64
rating_all_mean           53303 non-null float64
page_rate_count           78524 non-null int64
rev_rate_count            78524 non-null float64
page_comp_mean            77283 non-null float64
rev_comp_mean             72232 non-null float64
page_obj_mean             76953 non-null float64
rev_obj_mean              71014 non-null float64
page_trust_mean           77490 non-null float64
rev_trust_mean            72943 non-null float64
page_writ_mean            77450 non-null float64
rev_writ_mean             74095 non-null float64
page_all_mean             76417 non-null float64
page_all_median           76417 non-null float64
page_all_rate_count       78524 non-null float64
rev_all_mean              69028 non-null float64
rev_all_median            69028 non-null float64
rev_all_rate_count        78524 non-null float64
diff_page_all_mean        53303 non-null float64
absdif_page_all_mean      53303 non-null float64
diff_page_comp_mean       60263 non-null float64
absdif_page_comp_mean     60263 non-null float64
diff_page_obj_mean        57496 non-null float64
absdif_page_obj_mean      57496 non-null float64
diff_page_trust_mean      61993 non-null float64
absdif_page_trust_mean    61993 non-null float64
diff_page_writ_mean       65568 non-null float64
absdif_page_writ_mean     65568 non-null float64
diff_page_all_median      53303 non-null float64
absdif_page_all_median    53303 non-null float64
diff_rev_all_mean         53303 non-null float64
absdif_rev_all_mean       53303 non-null float64
diff_rev_comp_mean        60263 non-null float64
absdif_rev_comp_mean      60263 non-null float64
diff_rev_obj_mean         57496 non-null float64
absdif_rev_obj_mean       57496 non-null float64
diff_rev_trust_mean       61993 non-null float64
absdif_rev_trust_mean     61993 non-null float64
diff_rev_writ_mean        65568 non-null float64
absdif_rev_writ_mean      65568 non-null float64
diff_rev_all_median       53303 non-null float64
absdif_rev_all_median     53303 non-null float64
rpv                       78524 non-null float64
dtypes: float64(49), int64(3), object(2)

Now we need to create the nth_version variable to index versions.

In [44]:
versions = df[['page_title','rev_id','nth_rating']].groupby(['page_title', 'rev_id']).first().reset_index()
versions['nth_version'] = versions.groupby(['page_title']).cumcount() + 1

Now merge versions into the main dataframe.

In [45]:
df = pd.merge(left=df, right=versions.drop('nth_rating', 1), left_on=['page_title', 'rev_id'], right_on=['page_title', 'rev_id'])

Definition of FEFR Sample

To look more closely at the data, we will examine frequently edited, frequently rated (FEFR) pages:

  • with at least 5 rated versions (i.e. "revisions" or edits)
  • with the highest ratings per rated version

These pages have some amount of history, with a fair amount of information for each version in the history. Note, however, that both the number of rated versions (i.e. versions that are in our data) and the ratings per rated version (RPV) are not necessarily exogenous; in other words, it is quite possible that pages with many versions (and thus many edits), or high RPV, may be different in key and thus far unobserved ways from other pages. For example, these pages might be more controversial; we can imagine that controversial pages would cause people to rate pages they strongly disagree with and also to edit those pages, thus affecting both measures I use to define the sample. However, it is difficult, for example, to have confidence in the mean rating if there are only a handful of ratings for a particular page version. Thus, I'll leave the investigation of how representative these frequently edited, frequently rated pages are of Wikipedia pages in general until a later stage, when I have incorporated data from the Wiki DB itself.

In [46]:
df['page_rev_count'] = df.groupby('page_title').nth_version.transform('max')
In [47]:
fe_rpv = df[df.page_rev_count >= 5].groupby('page_title').rpv.first()
fe_rpv.sort(ascending=False)
fefr = df.set_index('page_title').loc[fe_rpv.head(20).index].reset_index()

Here is a list of the 20 FEFR pages:

In [48]:
fefr = fefr.sort(['page_title', 'nth_version', 'nth_rating'])  # DataFrame.sort returns a copy, so assign it back
sorted_fefr = fefr.groupby('page_title')[['page_rev_count', 'page_rate_count', 'rpv']].mean().sort('page_rev_count')
sorted_fefr
Out[48]:
                                           page_rev_count  page_rate_count        rpv
page_title
Alan_Questel                                            5              116  23.200000
Excess-3                                                5               66  13.200000
Acidic_oxide                                            6               53   8.833333
Partnership_accounting                                  6               57   9.500000
Zung_Self-Rating_Anxiety_Scale                          6               47   7.833333
Multitenancy                                            7               65   9.285714
Singaporean_national_referendum,_1962                   7               56   8.000000
Carbon_copy                                             7              110  15.714286
Game_server                                             8               73   9.125000
Ornamental_plant                                        9               77   8.555556
Damp_proofing                                          10               77   7.700000
Pulse_Polio                                            10               84   8.400000
Pen_pal                                                14              126   9.000000
Homework                                               18              203  11.277778
Implied_powers                                         21              170   8.095238
Business_development                                   26              408  15.692308
Ultra-high-definition_television                       35              272   7.771429
Arithmetic_progression                                 40              442  11.050000
List_of_spells_in_Harry_Potter                         42             1685  40.119048
United_States_Declaration_of_Independence              52              439   8.442308

20 rows × 3 columns

In [49]:
var = 'rating_any_mean'
axes = []
for title in sorted_fefr.index:
    d = fefr.loc[fefr.page_title==title, ['page_rev_count', var, 'nth_version']]
    prc = d.page_rev_count.mean()
    # one boxplot panel per page; panel width scales with the number of rated versions
    ax = d.boxplot(var, by='nth_version', figsize=(prc/3, 3), grid=False)
    ax.set_ylim([1,5])
    ax.set_title(title, ha='left')
    plt.suptitle("")
    # overlay the per-version mean rating on top of the boxplots
    mean = fefr[fefr.page_title==title].groupby('nth_version')[var].mean()
    mean.plot(ax=ax, style='bo-', linewidth=1, alpha=.8, mfc='none', grid=False)
    axes.append(ax)

print "Page Rating History, by Version", "Mean is traced in blue"
Page Rating History, by Version Mean is traced in blue

There does not appear to be a consistent improvement over time. Let's check this for the whole sample, first by looking at the correlation.

In [50]:
df[['rating_any_mean', 'rating_all_mean', 'comp', 'obj', 'trust', 'writ', 'nth_version']].corr()
Out[50]:
rating_any_mean rating_all_mean comp obj trust writ nth_version
rating_any_mean 1.000000 1.000000 0.906040 0.900340 0.919510 0.912760 0.063059
rating_all_mean 1.000000 1.000000 0.898970 0.895074 0.909228 0.902238 0.063983
comp 0.906040 0.898970 1.000000 0.715450 0.749500 0.764108 0.096140
obj 0.900340 0.895074 0.715450 1.000000 0.774930 0.738221 0.035822
trust 0.919510 0.909228 0.749500 0.774930 1.000000 0.753312 0.051972
writ 0.912760 0.902238 0.764108 0.738221 0.753312 1.000000 0.049783
nth_version 0.063059 0.063983 0.096140 0.035822 0.051972 0.049783 1.000000

7 rows × 7 columns

Indeed, there is very little correlation between nth_version and any of the rating dimensions or averages: according to the page ratings, additional edits are not associated with significantly higher ratings, all other things being equal. Note that in order to move beyond ceteris paribus, i.e. to actually look at the effects of different kinds of edits, I'll need to bring in data on the edits themselves, using MediaWiki's API.
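
The correlations above are purely descriptive. As a rough significance check, here is a sketch using scipy.stats (already imported as stats); note that rating rows within a page are not independent, so the p-value is optimistic:

sub = df[['rating_any_mean', 'nth_version']].dropna()
r, p = stats.pearsonr(sub.rating_any_mean, sub.nth_version)
print "r = %0.3f, p = %0.3g" % (r, p)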

Import From Wiki API

In [85]:
from wikitools import wiki, api
site = wiki.Wiki('http://en.wikipedia.org/w/api.php')
In [106]:
# first and last rated revision id, and the page id, for each FEFR page
first_revs = fefr.groupby('page_title').rev_id.min()
last_revs = fefr.groupby('page_title').rev_id.max()
page_ids = fefr.groupby('page_title').page_id.first()
In [84]:
pd.DataFrame([first_revs, last_revs], index=['first', 'last']).T
Out[84]:
first last
page_title
Acidic_oxide 427061939 500154456
Alan_Questel 485856064 490388040
Arithmetic_progression 439795331 499085638
Business_development 439743024 501765668
Carbon_copy 439434682 498533853
Damp_proofing 432333340 496300260
Excess-3 434543512 474299975
Game_server 439839963 497489178
Homework 440714547 500086023
Implied_powers 420335952 501842696
List_of_spells_in_Harry_Potter 440252234 500539016
Multitenancy 438799944 502073454
Ornamental_plant 436098180 491862710
Partnership_accounting 440550672 479834488
Pen_pal 438856013 500418856
Pulse_Polio 441492532 479155548
Singaporean_national_referendum,_1962 395108968 484949665
Ultra-high-definition_television 441055808 498702845
United_States_Declaration_of_Independence 440924612 500818030
Zung_Self-Rating_Anxiety_Scale 430519224 480106228

20 rows × 2 columns

Set up the API call, and run it for every page. (For some reason, wikitools won't allow pulling the full revision range in a single request here, so I set querycontinue=False and handle the 'query-continue' continuation manually in get_page_revisions below.)
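
For reference, the request that the wikitools wrapper below issues looks roughly like this; a minimal sketch using the requests library (requests is not used elsewhere in this notebook, 'Excess-3' is just an example title, and rvprop is trimmed for illustration):

import requests

params = {'action': 'query', 'format': 'json',
          'titles': 'Excess-3',
          'prop': 'revisions',
          'rvprop': 'ids|timestamp|user',
          'rvdir': 'newer',
          'rvlimit': 50}
resp = requests.get('http://en.wikipedia.org/w/api.php', params=params).json()
print resp['query']['pages'].keys()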

In [306]:
def get_revisions(title, rvstartid=None, rvendid=None, rvcontinue=None):
    """Get a list of revision json-objects for a page, starting at a certain rvstartid/rvcontinue id"""
    params = {'action': 'query',
              'titles': title,
              'prop': 'revisions',
              'rvprop': 'ids|flags|timestamp|user|userid|size|comment|parsedcomment|content|tags|flagged',
              'rvdir': 'newer',
              'rvstartid': rvstartid,
              'rvendid': rvendid,
              'rvlimit': 50}
    # only pass rvcontinue when resuming a previous query
    if rvcontinue is not None:
        params['rvcontinue'] = rvcontinue
    request = api.APIRequest(site, params)
    result = request.query(querycontinue=False)
    return result
In [340]:
def get_page_revisions(title, startid, endid, pid):
    """
    Get all page revisions for a given page, starting at a startid and ending at endid.
    
    This will call get_revisions until the returned object contains no 'query-continue' element.
    
    It returns a list of revision json-objects.
    """
    query_res = get_revisions(title, startid, endid)
    page_res = query_res['query']['pages']
    if pid in page_res:
        result = page_res[pid]['revisions']
    else:
        return []

    # keep requesting until the API stops returning a 'query-continue' token
    while 'query-continue' in query_res:
        print "Continued at: %s" % query_res['query-continue']['revisions']['rvcontinue']
        if 'warnings' in query_res:
            print "Query warning: %s" % query_res['warnings']
        
        query_res = get_revisions(title, 
                                  startid, 
                                  endid, 
                                  query_res['query-continue']['revisions']['rvcontinue'])
        page_res = query_res['query']['pages']
        
        if pid in page_res:
            result.extend(page_res[pid]['revisions'])
    
    return result

Go through each page title and get its list of revision objects. Then create a DataFrame from those objects and store it in a dict (revs), keyed by page title.

In [341]:
revs = {}
for title in first_revs.index:
    startid = first_revs[title]
    endid = last_revs[title]
    pid = str(page_ids[title])
    
    revs[title] = pd.DataFrame(get_page_revisions(title, startid, endid, pid))
    print "%s: %d" % (title, len(revs[title]))
Acidic_oxide: 8
Alan_Questel: 0
Continued at: 476778108
Arithmetic_progression: 77
Business_development: 42
Carbon_copy: 26
Damp_proofing: 21
Excess-3: 6
Game_server: 34
Continued at: 470412069
Homework: 58
Implied_powers: 41
Continued at: 471954482
List_of_spells_in_Harry_Potter: 75
Multitenancy: 9
Ornamental_plant: 19
Partnership_accounting: 8
Pen_pal: 27
Continued at: 450084450
Pulse_Polio: 75
Singaporean_national_referendum,_1962: 7
Ultra-high-definition_television: 0
Continued at: 445311436
Continued at: 449893631
Continued at: 471344690
United_States_Declaration_of_Independence: 192
Zung_Self-Rating_Anxiety_Scale: 7
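
Two pages (Alan_Questel and Ultra-high-definition_television) came back with zero revisions; a small sketch to flag empty results so they can be looked into and re-fetched:

empty = [title for title, d in revs.items() if len(d) == 0]
print "Pages with no revisions fetched:", empty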

In [342]:
# Concat all the revisions into a single dataframe, which we can merge in
revdf = pd.concat(revs.values())
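
The comment above mentions merging these revisions in; that merge is not shown in the notebook, but a sketch might look like the following (assuming revdf has the 'revid' column produced by rvprop 'ids', as in the sample output further below; revisions that were never rated simply drop out of a left merge keyed on fefr):

# attach revision metadata (user, timestamp, size, comment, ...) to the FEFR rating rows
fefr_rev = pd.merge(fefr, revdf, how='left', left_on='rev_id', right_on='revid')
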
In [1189]:
# count the number of distinct rated versions for each FEFR page
titles = fefr.groupby('page_title', sort=True).page_title.first()
print titles
for title in titles:
    nversions = len(fefr.loc[fefr.page_title==title, 'nth_version'].unique())
    print title, nversions
page_title
Acidic_oxide                                                              Acidic_oxide
Alan_Questel                                                              Alan_Questel
Arithmetic_progression                                          Arithmetic_progression
Business_development                                              Business_development
Carbon_copy                                                                Carbon_copy
Damp_proofing                                                            Damp_proofing
Excess-3                                                                      Excess-3
Game_server                                                                Game_server
Homework                                                                      Homework
Implied_powers                                                          Implied_powers
List_of_spells_in_Harry_Potter                          List_of_spells_in_Harry_Potter
Multitenancy                                                              Multitenancy
Ornamental_plant                                                      Ornamental_plant
Partnership_accounting                                          Partnership_accounting
Pen_pal                                                                        Pen_pal
Pulse_Polio                                                                Pulse_Polio
Singaporean_national_referendum,_1962            Singaporean_national_referendum,_1962
Ultra-high-definition_television                      Ultra-high-definition_television
United_States_Declaration_of_Independence    United_States_Declaration_of_Independence
Zung_Self-Rating_Anxiety_Scale                          Zung_Self-Rating_Anxiety_Scale
Name: page_title, dtype: object
Acidic_oxide 6
Alan_Questel 5
Arithmetic_progression 40
Business_development 26
Carbon_copy 7
Damp_proofing 10
Excess-3 5
Game_server 8
Homework 18
Implied_powers 21
List_of_spells_in_Harry_Potter 42
Multitenancy 7
Ornamental_plant 9
Partnership_accounting 6
Pen_pal 14
Pulse_Polio 10
Singaporean_national_referendum,_1962 7
Ultra-high-definition_television 35
United_States_Declaration_of_Independence 52
Zung_Self-Rating_Anxiety_Scale 6

In [1218]:
 
Out[1218]:
{u'794849': {u'ns': 0,
  u'pageid': 794849,
  u'revisions': [{u'parentid': 389194926, u'revid': 427061939},
   {u'parentid': 427061939, u'revid': 445967216},
   {u'parentid': 445967216, u'revid': 453935315},
   {u'parentid': 453935315, u'revid': 462203723},
   {u'parentid': 462203723, u'revid': 490775320},
   {u'parentid': 490775320, u'revid': 490775470},
   {u'parentid': 490775470, u'revid': 500154295},
   {u'parentid': 500154295, u'revid': 500154456}],
  u'title': u'Acidic oxide'}}

Open Questions

  • Are pages with a high edit velocity different from other pages?
  • Are ratings more or less reliable for pages with many ratings per version?
  • Are list pages different from other pages?
    • For example, is completeness more relevant for lists?