# Automatic discovery of community organizations for long-term package maintenance

In [None]:
%matplotlib inline

# Print the interpreter and library versions so that the recorded outputs in
# this notebook can be tied to a concrete environment.
import sys
print(f'Python {sys.version}')

import IPython
print(f'IPython {IPython.__version__}')

print('\nLibraries:\n')

import matplotlib
import matplotlib.pyplot as plt
print(f'matplotlib {matplotlib.__version__}')

import numpy as np
print(f'numpy {np.__version__}')

import pandas as pd
from pandas.plotting import register_matplotlib_converters
print(f'pandas {pd.__version__}')

import requests
print(f'requests {requests.__version__}')

Python 3.7.4 (default, Jul  8 2019, 18:31:06) 
[GCC 7.4.0]
IPython 7.6.1

Libraries:

matplotlib 3.1.1
numpy 1.17.0
pandas 0.25.0
requests 2.22.0


In [None]:
# GitHub API token sent in the Authorization header by the request helpers
# below. It is blank here and has to be filled in before running the notebook.
api_token = ''

def send_rest_request(url):
    """Perform an authenticated GET on ``url`` and return the decoded JSON body.

    The module-level ``api_token`` is sent as a ``token`` Authorization
    header. An unsuccessful HTTP status raises instead of returning.
    """
    response = requests.get(
        url=url,
        headers={'Authorization': f'token {api_token}'},
    )
    # Abort on any unsuccessful request rather than decoding an error body.
    response.raise_for_status()
    return response.json()

def send_graphql_request(query, variables):
    """POST a GraphQL query to the GitHub API and return the decoded JSON.

    Parameters
    ----------
    query : str
        GraphQL document text.
    variables : dict
        Values for the variables declared in ``query``.

    Returns
    -------
    dict
        The full response payload; callers read its ``'data'`` entry.

    Raises
    ------
    requests.HTTPError
        If the HTTP status signals an unsuccessful request.
    RuntimeError
        If the payload reports errors and carries no data at all.
    """
    headers = {'Authorization': f'token {api_token}'}
    url = 'https://api.github.com/graphql'
    payload = {'query': query, 'variables': variables}
    r = requests.post(url=url, json=payload, headers=headers)
    r.raise_for_status() # Abort if unsuccessful request
    result = r.json()
    # A GraphQL-level failure is reported inside the body with a successful
    # HTTP status, so raise_for_status() does not see it. Only a response
    # without any data is treated as fatal: a partial response (data plus
    # errors) is returned as-is so that callers can handle null entries.
    if 'errors' in result and result.get('data') is None:
        raise RuntimeError(f'GraphQL request failed: {result["errors"]}')
    return result

## Phase 1: get a preliminary list of organizations

GitHub provides only two APIs for listing organizations. The first is a REST endpoint that returns the full list, but it requires many requests, given that there are more than 2,000,000 organizations on GitHub (https://developer.github.com/changes/2015-06-17-organizations-endpoint/), and it only provides the organization logins and descriptions, nothing more. The second is the Search API, which is limited to browsing 1000 results.

We choose to use the second to limit the number of requests, but this requires finding ways of querying for fewer than 1000 results at a time, using the limited filters that search queries provide.

Our first restriction will be to limit ourselves to organizations with at least 5 public repositories.
We are aware that this is an arbitrary restriction that will exclude community organizations that are just starting and have not yet reached that number.

Our second restriction will be to search by keywords.
We list as many keywords as we could think of that could appear in the names or the descriptions of this type of organization:

In [None]:
# Search keywords, one search (or one set of partitioned searches) per entry.
# Entries mentioned in "To add next time" comments are not part of this list.
keywords = [
    # To add next time: 'addon', 'addons',
    'app', 'apps', 'application', 'applications',
    'care', 'caring',
    'collab', 'collaboration', 'collaborative',
    'collection', 'collective',
    'common', 'commons',
    'community',
    'component', 'components',
    # To add next time: 'contribs'
    'contrib', 'contribution', 'contributions', 'contributing',
    'distribute', 'distribution', 'distributions',
    'ecosystem', 'ecosystems',
    'extension', 'extensions',
    'gather',
    'give', 'giving',
    'group',
    'help', 'helper', 'helpers',
    'library', 'libraries',
    'maintain', 'maintainer', 'maintainers', 'maintenance', 'maintaining',
    'member', 'members',
    'module', 'modules',
    'open source',
    'org', 'organization',
    'package', 'packages',
    'participate', 'participant', 'participants', 'participation',
    'people',
    'place',
    'plugin', 'plugins',
    'projects',
    # Not project singular because that would give too many results
    # and this is not about organizations focused on a single project
    'quality',
    'repository', 'repositories',
    'reuse', 'reusable',
    'share', 'shared', 'sharing',
    'support', 'supporter', 'supporters', 'supporting',
    'together',
    # To add next time: tool, tools
    'unofficial',
    'user', 'users'
]
len(keywords)

75

For some keywords, this still gives too many results so we additionally partition using language filters:

In [None]:
# Languages used to split a keyword search that exceeds the 1000-result cap.
# Every language excluded by the catch-all filter needs a positive filter of
# its own, otherwise organizations in that language are never fetched: the
# previous hand-written list excluded Ruby in the catch-all filter without
# ever querying `language:Ruby`.
partition_languages = [
    'JavaScript', 'Java', 'Python', 'PHP', 'HTML',
    'C#', 'C++', 'C', 'Ruby', 'CSS',
]

# One filter per language, followed by one filter matching everything else.
# Both are derived from the same list so that they cannot drift apart.
language_filters = [f'language:{name}' for name in partition_languages]
language_filters.append(
    ' '.join(f'-language:{name}' for name in partition_languages)
)

In [None]:
# GraphQL search document: 50 results per page, resumed with the $cursor
# variable. For each Organization node it requests the profile fields, the
# member count, the repository count, and the stargazer and assignable-user
# counts of the single most starred repository.
query = '''
query searchOrganizations($query: String!,$cursor: String) {
  search(type:USER,query:$query, first: 50, after: $cursor) {
    userCount
    pageInfo {
      endCursor
      hasNextPage
    }
    nodes {
      ... on Organization {
        login
        name
        description
        websiteUrl
        membersWithRole {
          totalCount
        }
        repositories(first: 1, orderBy: {field: STARGAZERS, direction: DESC}) {
          totalCount
          nodes {
            stargazers {
              totalCount
            }
            assignableUsers {
              totalCount
            }
          }
        }
      }
    }
  }
}
'''

In [None]:
# Per-organization fields stored in the results table.
columns = [
    'name',
    'description',
    'url',
    # Number of public members
    'members',
    # Number of public repositories
    'repositories',
    # Number of stars of the most starred repository
    'stars',
    # Number of assignable users of the most starred repository
    'collaborators',
]

# One flag column per search keyword, named 'keyword <keyword>'.
keyword_columns = [f'keyword {keyword}' for keyword in keywords]

In [None]:
# Empty results table; the four count columns use a nullable unsigned
# integer dtype so that a missing count does not turn them into floats.
values = pd.DataFrame(columns=columns + keyword_columns).astype(
    dict.fromkeys(('members', 'repositories', 'stars', 'collaborators'), 'UInt32')
)

In [None]:
def paged_query(keyword, language=''):
    """Run one organization search and record every result in ``values``.

    Pages through the GraphQL search built from ``keyword`` (restricted to
    organizations with at least 5 repositories and optionally narrowed by a
    ``language`` search filter). One row per organization login is written
    into the module-level ``values`` table, and the ``keyword <keyword>``
    column of that row is set to True.

    Parameters
    ----------
    keyword : str
        Search keyword; also selects the keyword flag column.
    language : str, optional
        Extra search filter text appended to the search query.

    Raises
    ------
    ValueError
        If the search reports more than 1000 results. The check runs before
        any row of the current page is written.
    """
    # Extra search terms attached to specific keywords.
    # NOTE(review): presumably these exclude noisy matches; the reason is not
    # stated in this file.
    exclusions = {
        'repository': 'NOT aur-archive',
        'user': 'NOT aur-archive',
        'collaborative': 'NOT GITenberg',
    }
    exclude = exclusions.get(keyword, '')
    # The search string is the same for every page; only the cursor changes.
    searchQuery = f'type:organization repos:>=5 {keyword} {exclude} {language}'
    next_page = True
    cursor = None
    while next_page:
        print(f'Search query: {searchQuery}')
        response = send_graphql_request(
            query,
            {'query': searchQuery, 'cursor': cursor}
        )
        search_json = response['data']['search']
        if search_json['userCount'] > 1000:
            raise ValueError('Query not restricted enough: more than 1000 results.')
        page_info = search_json['pageInfo']
        next_page = page_info['hasNextPage']
        cursor = page_info['endCursor']
        for node in search_json['nodes']:
            # The fields are requested through an `... on Organization`
            # fragment, so a result of another type comes back as an empty
            # object (and an unavailable one as null). Skip those instead of
            # failing on the missing 'login' key.
            if not node:
                continue
            # Index
            login = node['login']
            # Fields
            values.loc[login, 'name'] = node['name']
            values.loc[login, 'description'] = node['description']
            values.loc[login, 'url'] = node['websiteUrl']
            values.loc[login, 'members'] = node['membersWithRole']['totalCount']
            repos_json = node['repositories']
            repos_nb = repos_json['totalCount']
            values.loc[login, 'repositories'] = repos_nb
            if repos_nb > 0:
                # Only the most starred repository was requested.
                repo_json = repos_json['nodes'][0]
                values.loc[login, 'stars'] = repo_json['stargazers']['totalCount']
                values.loc[login, 'collaborators'] = repo_json['assignableUsers']['totalCount']
            values.loc[login, f'keyword {keyword}'] = True

In [None]:
# NOTE(review): only keywords from index 60 onward are processed here;
# presumably the earlier ones were fetched in a previous run. Confirm before
# re-running from scratch.
for keyword in keywords[60:]:
    try:
        paged_query(keyword)
    except ValueError:
        # More than 1000 results for the bare keyword: split the search into
        # one query per language filter.
        for language in language_filters:
            paged_query(keyword, language)
            print(f'Now fetched a total number of {len(values)} organizations.')
    else:
        print(f'Now fetched a total number of {len(values)} organizations.')

Search query: type:organization repos:>=5 repository NOT aur-archive 
Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript
Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript
Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript
Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript
Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript
Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript
Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript
Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript
Now fetched a total number of 27130 organizations.
Search query: type:organization repos:>=5 repository NOT aur-archive language:Java
Search query: type:organization repos:>=5 repository NOT aur-archive language:Java
S

Search query: type:organization repos:>=5 repositories  language:C++
Now fetched a total number of 28453 organizations.
Search query: type:organization repos:>=5 repositories  language:C
Search query: type:organization repos:>=5 repositories  language:C
Search query: type:organization repos:>=5 repositories  language:C
Now fetched a total number of 28457 organizations.
Search query: type:organization repos:>=5 repositories  language:CSS
Now fetched a total number of 28457 organizations.
Search query: type:organization repos:>=5 repositories  -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS
Search query: type:organization repos:>=5 repositories  -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS
Search query: type:organization repos:>=5 repositories  -language:JavaScript -language:Java -lan

Search query: type:organization repos:>=5 support  language:CSS
Search query: type:organization repos:>=5 support  language:CSS
Now fetched a total number of 31200 organizations.
Search query: type:organization repos:>=5 support  -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS
Search query: type:organization repos:>=5 support  -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS
Search query: type:organization repos:>=5 support  -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS
Search query: type:organization repos:>=5 support  -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS
Search query: typ

In [None]:
# Persist the phase-one table (indexed by organization login); phase two
# reloads it from this file.
values.to_csv('community-organizations-phase-one.csv')

## Phase 2: filter the results and fetch more information

### Filter

We start with the organizations that we have fetched in phase 1.
We have fetched more than 32,000 organizations, which is close to 15% of all GitHub organizations with at least 5 repositories.

In [None]:
# Reload the phase-one results, indexed by the first CSV column and with the
# four count columns restored to their nullable unsigned integer dtype.
values = pd.read_csv(
    'community-organizations-phase-one.csv',
    index_col=0,
    dtype=dict.fromkeys(('members', 'repositories', 'stars', 'collaborators'), 'UInt32'),
)

  interactivity=interactivity, compiler=compiler, result=result)


Apparently, the search filters are not fully reliable, because about 1% of our search results have fewer than 5 repositories (four of them even having zero repositories):

In [None]:
len(values[values['repositories'] < 5]) / len(values)

0.011839427760991552

In [None]:
# Drop the organizations with fewer than 5 repositories that the search
# returned anyway.
values = values[values['repositories'] >= 5]

Because many members can decide to make their membership status private (and in fact this is even the default), the number of public members is just a lower bound on the actual number of members of an organization.
To estimate an upper bound on organization membership, we have also retrieved the number of assignable users of the most starred repository.

Assignable users are organization members with read access to the repository, or collaborators with write access specifically on this repository.
In theory, it is possible for an organization member to not be an assignable user, if the organization owners have changed the default member permissions from "read" to "none".
In this case, the number of public members of the organization could be larger than the number of assignable users on the most starred repository, but such a situation is quite rare: it represents less than 3% of our dataset:

In [None]:
len(values[values['members'] > values['collaborators']]) / len(values)

0.021029641185647426

In most organizations, there are strictly more collaborators than public members:

In [None]:
# Subtract as signed integers: both columns are unsigned (UInt32), so rows
# where members exceed collaborators would otherwise wrap around to huge
# positive values instead of becoming negative.
np.median(values['collaborators'].astype('Int64') - values['members'].astype('Int64'))

3.0

Most organizations are not community organizations.
Community organizations (at least established ones) should have a strong membership.
Thus we select organizations with at least 10 public members or collaborators on the most starred repository.
This represents about 23% of the remaining organizations:

In [None]:
len(values[(values['members'] >= 10) | (values['collaborators'] >= 10)]) / len(values)

0.23051482059282372

In [None]:
# Keep organizations with at least 10 public members, or at least 10
# assignable users on their most starred repository.
values = values[(values['members'] >= 10) | (values['collaborators'] >= 10)]

Most organizations do not maintain any popular projects. Community organizations should host several popular projects. Stars are often used as a proxy for popularity on GitHub. It is especially relevant for libraries that are mainly targeted to other developers. We set an arbitrary lower limit of 10 stars on the most starred project. This represents about 60% of the remaining organizations:

In [None]:
len(values[values['stars'] >= 10]) / len(values)

0.5929886302111532

In [None]:
# Keep organizations whose most starred repository has at least 10 stars.
values = values[values['stars'] >= 10]

In [None]:
len(values)

4381

### Fetch more information

For each organization in the remaining list, we fetch the creation date of the organization, and the number of repositories that were created before this date, as an under-approximation of the number of transferred repositories.
The GraphQL API allows us to batch requests and thus to send far fewer requests:

In [None]:
def build_graphql_query(imin, batch_size=40):
    """Build one batched GraphQL query asking for organization creation dates.

    The batch covers the rows of the module-level ``values`` table from
    position ``imin`` up to ``imin + batch_size`` (or the end of the table).
    Each organization gets its own aliased field ``request<i>``, where ``i``
    is the position inside the batch.

    Parameters
    ----------
    imin : int
        Position in ``values`` of the first organization of the batch.
    batch_size : int, optional
        Maximum number of organizations per query (40 by default).

    Returns
    -------
    tuple
        ``(query, index, next_imin)``: the query text, the index labels
        covered by the batch, and the start of the next batch (``None`` once
        the end of the table has been reached).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, which could never make progress.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    isup = min(imin + batch_size, len(values))
    next_imin = isup if isup < len(values) else None
    index = values.index[imin:isup]
    # Assemble all aliased fields in one pass instead of repeated `+=`.
    aliased_fields = ''.join(
        """
        request%d: organization(login: "%s") {
          createdAt
        }
        """ % (i, owner)
        for i, owner in enumerate(index)
    )
    query = """
    query {
    """ + aliased_fields + """
    }
    """
    return query, index, next_imin

def save_testorg_result(json, index):
    """Copy the creation dates of one batched response into ``values``.

    Parameters
    ----------
    json : dict
        Decoded GraphQL response whose ``'data'`` entry maps the aliases
        ``request0``, ``request1``, ... to organization results.
    index : sequence
        Labels of the ``values`` rows that were queried, in alias order.

    A null result leaves the row untouched and prints a warning.
    """
    data = json['data']
    i = 0
    while f'request{i}' in data:
        owner = index[i]
        result = data[f'request{i}']
        if result is None:
            # Use the label directly: looking the row up in `values` only to
            # read its name back gives the same label at a higher cost.
            print(f'Warning: {owner} has been deleted')
        else:
            values.loc[owner, 'creation date'] = result['createdAt']
        i += 1

In [None]:
# Walk through `values` one batch at a time until the builder reports that
# there is no further batch.
# NOTE: this rebinds the module-level `query`, replacing the search document
# that paged_query() reads.
imin = 0
while imin is not None:
    # Progress indicator rewritten in place thanks to the carriage return.
    print(f'imin: {imin}\r', end='', flush=True)
    query, index, imin = build_graphql_query(imin)
    json = send_graphql_request(query, {})
    save_testorg_result(json, index)

imin: 4360

In [None]:
def build_graphql_query(imin, batch_size=40):
    """Build one batched GraphQL query counting pre-creation repositories.

    The batch covers the rows of the module-level ``values`` table from
    position ``imin`` up to ``imin + batch_size`` (or the end of the table).
    For each organization, an aliased ``request<i>`` repository search counts
    the repositories of that owner created before the organization's own
    ``'creation date'``.

    Parameters
    ----------
    imin : int
        Position in ``values`` of the first organization of the batch.
    batch_size : int, optional
        Maximum number of organizations per query (40 by default).

    Returns
    -------
    tuple
        ``(query, index, next_imin)``: the query text, the index labels
        covered by the batch, and the start of the next batch (``None`` once
        the end of the table has been reached).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, which could never make progress.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    isup = min(imin + batch_size, len(values))
    next_imin = isup if isup < len(values) else None
    index = values.index[imin:isup]
    # NOTE(review): a row without a creation date is interpolated as-is into
    # the search string; confirm that such rows cannot reach this point.
    aliased_fields = ''.join(
        """
        request%d: search(query: "user:%s created:<%s", type: REPOSITORY) {
          repositoryCount
        }
        """ % (i, owner, values.loc[owner, 'creation date'])
        for i, owner in enumerate(index)
    )
    query = """
    query {
    """ + aliased_fields + """
    }
    """
    return query, index, next_imin

def save_testorg_result(json, index):
    """Copy the repository counts of one batched response into ``values``.

    ``json['data']`` maps the aliases ``request0``, ``request1``, ... to
    search results. The ``repositoryCount`` of the i-th alias is stored in
    the ``'transferred repositories'`` column of the row labelled
    ``index[i]``. Processing stops at the first missing alias.
    """
    payload = json['data']
    position = 0
    alias = 'request0'
    while alias in payload:
        count = payload[alias]['repositoryCount']
        values.loc[index[position], 'transferred repositories'] = count
        position += 1
        alias = f'request{position}'

In [None]:
# Walk through `values` one batch at a time until the builder reports that
# there is no further batch.
# NOTE: this rebinds the module-level `query`, replacing the search document
# that paged_query() reads.
imin = 0
while imin is not None:
    # Progress indicator rewritten in place thanks to the carriage return.
    print(f'imin: {imin}\r', end='', flush=True)
    query, index, imin = build_graphql_query(imin)
    json = send_graphql_request(query, {})
    save_testorg_result(json, index)

imin: 4360

In [None]:
# Persist the phase-two table with a fixed column order: base fields, the two
# fields fetched in this phase, then the keyword flags.
values[
    [*columns, 'creation date', 'transferred repositories', *keyword_columns]
].to_csv('community-organizations-phase-two.csv')

## Phase 3: browse through organizations with transferred repos

In [None]:
# Reload the phase-two results with parsed creation dates and nullable
# unsigned counts, then put the organizations with the most transferred
# repositories first.
values = pd.read_csv(
    'community-organizations-phase-two.csv',
    index_col=0,
    parse_dates=['creation date'],
    dtype=dict.fromkeys(('members', 'repositories', 'stars', 'collaborators'), 'UInt32'),
)
values = values.sort_values('transferred repositories', ascending=False)

Organizations with at least one transferred repository from before their creation represent 35% of the remaining organizations:

In [None]:
len(values[values['transferred repositories'] > 0]) / len(values)

0.35151791828349693

And organizations with at least two transferred repositories from before their creation represent about 21% of the same organizations:

In [None]:
len(values[values['transferred repositories'] > 1]) / len(values)

0.21410636840903904

In [None]:
len(values[values['transferred repositories'] > 1])

938

In [None]:
values[values['transferred repositories'] > 1].sort_values('creation date')[0:30]

Unnamed: 0,name,description,url,members,repositories,stars,collaborators,creation date,transferred repositories,keyword app,...,keyword shared,keyword sharing,keyword support,keyword supporter,keyword supporters,keyword supporting,keyword together,keyword unofficial,keyword user,keyword users
datadesk,Los Angeles Times Data Desk,"Analysis, applications and automation from a t...",https://www.latimes.com,8,184,313,27,2010-07-02 02:04:07+00:00,6.0,,...,,,,,,,,,,
collective,Collective,Plone add-ons shared code repositories,https://collective.github.io,268,1674,569,628,2010-08-13 00:04:43+00:00,7.0,,...,True,,,,,,,,,
uncopenweb,UNC Open Web Group,,http://sites.google.com/site/uncopenweb/,11,23,15,2,2010-09-04 01:22:47+00:00,6.0,,...,,,,,,,,,,
PerlDancer,PerlDancer,The Dancer Developers group,http://perldancer.org,10,71,708,15,2010-09-21 12:27:49+00:00,2.0,,...,,,,,,,,,,True
symphonists,Symphony Community,,https://www.getsymphony.com,12,106,47,13,2010-10-21 15:40:12+00:00,56.0,,...,,,,,,,,,,
libtom,libtom,libtom projects,http://www.libtom.net,3,7,859,22,2010-10-22 09:12:56+00:00,5.0,,...,,,,,,,,,,
xcore,XCore open source project,,github.xcore.com,26,119,75,7,2011-01-13 14:16:30+00:00,3.0,,...,,,,,,,,,,
silverstripe-archive,SilverStripe Archive,Archive of unsupported SilverStripe modules. I...,http://silverstripe.org,10,71,72,11,2011-01-17 00:22:34+00:00,4.0,,...,,,True,,,,,,,
mapbox,Mapbox,Mapbox is the location data platform for mobil...,https://www.mapbox.com,62,812,4700,458,2011-02-04 19:02:13+00:00,4.0,,...,,,,,,,,,,
openstate,Open State Foundation,Open State Foundation promotes digital transpa...,https://openstate.eu,18,107,23,13,2011-03-15 21:42:43+00:00,2.0,,...,,,,,,,,,,


### A list of instances of this model of community organizations

In [None]:
# Print the base fields and the creation date of each known instance,
# separated by a blank line.
for org in (
    'coq-community',
    'dlang-community',
    'elm-community',
    'elytra',
    'fluent-plugins-nursery',
    'ocaml-community',
    'react-native-community',
    'reasonml-community',
    'electron-userland',
    'fsprojects',
    'sous-chefs',
    'voxpupuli',
):
    print(values.loc[org][columns + ['creation date']], end='\n\n')

name                                                 coq-community
description      A project for a collaborative, community-drive...
url                     https://github.com/coq-community/manifesto
members                                                         27
repositories                                                    22
stars                                                          112
collaborators                                                   27
creation date                            2017-12-11 16:11:12+00:00
Name: coq-community, dtype: object

name                                            D Community hub
description                Community hub for popular D projects
url              https://github.com/dlang-community/discussions
members                                                       8
repositories                                                 24
stars                                                       293
collaborators                               

## Future work: find meta-repositories

We fetch metrics on all the repositories of a given organization (or just the 100 most starred ones) and we hope to find the meta-repository as an outlier for some metrics.

In [None]:
# GraphQL document requesting per-repository metrics for the 100 most starred
# repositories of the organization given by $org.
# NOTE: this rebinds the module-level `query` name used by earlier cells.
query = """
query repoMetrics($org: String!) {
  organization(login: $org) {
    repositories(first: 100, orderBy: {field: STARGAZERS, direction: DESC}) {
      nodes {
        name
        createdAt
        diskUsage
        issues(first:1) {
          totalCount
          nodes {
            createdAt
            comments { totalCount }
          }
        }
        pullRequests(first:1) {
          totalCount
          nodes {
            createdAt
            comments { totalCount }
          }
        }
        isFork
        forkCount
        stargazers { totalCount }
        languages {
          totalSize
          totalCount
        }
        primaryLanguage { name }
      }
    }
  }
}
"""