This shows you the differences between two versions of the page.
|
software_engineering:languages [2012/07/06] emilmont |
software_engineering:languages [2013/01/08] (current) emilmont |
||
|---|---|---|---|
| Line 2: | Line 2: | ||
| ===== Open Source Activity ===== | ===== Open Source Activity ===== | ||
| The 9 languages with the most users in open source according to <html><a href ="http://www.ohloh.net"><img src="http://www.ohloh.net/images/badges/mini.gif" width="80" height="15" /></a></html>: | The 9 languages with the most users in open source according to <html><a href ="http://www.ohloh.net"><img src="http://www.ohloh.net/images/badges/mini.gif" width="80" height="15" /></a></html>: | ||
| + | - JavaScript | ||
| - C | - C | ||
| - Java | - Java | ||
| - | - JavaScript | ||
| - C++ | - C++ | ||
| - Python | - Python | ||
| - PHP | - PHP | ||
| + | - C# | ||
| + | - Ruby | ||
| - Perl | - Perl | ||
| - | - Ruby | ||
| - | - C# | ||
| - | GitHub repository: https://github.com/emilmont/LangPop | + | GitHub repository: |
| - | + | * [[https://github.com/emilmont/Experiments/blob/master/langpop.py|Language Popularity]] | |
| - | <code python> | + | * [[https://github.com/emilmont/Algorithms/blob/master/algorithms/clusters/hierarchical.py|Hierarchical Clustering]] |
| - | from numpy import array, mean | + | |
| - | from numpy.linalg import norm | + | |
| - | + | ||
| - | class Item: | + | |
| - | def __init__(self, name, values): | + | |
| - | self.name = name | + | |
| - | self.values = array(values) | + | |
| - | + | ||
| - | def distance(self, other): | + | |
| - | return norm(self.values - other.values) | + | |
| - | + | ||
| - | def __cmp__(self, other): | + | |
| - | return cmp(norm(self.values), norm(other.values)) | + | |
| - | + | ||
| - | class Group(Item): | + | |
| - | def __init__(self, items): | + | |
| - | self.items = items | + | |
| - | self.values = mean(array([i.values for i in self.items]), axis=0) | + | |
| - | + | ||
| - | def __str__(self): | + | |
| - | self.items.sort(reverse=True) | + | |
| - | return "[%s]" % ', '.join([i.name for i in self.items]) | + | |
| - | + | ||
| - | def pop_items(l, indexes): | + | |
| - | indexes.sort(reverse=True) | + | |
| - | return [l.pop(index) for index in indexes] | + | |
| - | + | ||
| - | def find_clusters(items, cluster_num): | + | |
| - | # Initially create a group for each item | + | |
| - | groups = [Group([item]) for item in items] | + | |
| - | cache_dist = {} | + | |
| - | + | ||
| - | # Iterate until the number of groups match the desired number of clusters | + | |
| - | while len(groups) > cluster_num: | + | |
| - | + | ||
| - | # Find the closest pair of groups | + | |
| - | closest_pair, shortest_distance = None, None | + | |
| - | for a in range(len(groups)): | + | |
| - | for b in range(a+1, len(groups)): | + | |
| - | + | ||
| - | # Calculate distance | + | |
| - | dist_id = (id(groups[a]), id(groups[b])) | + | |
| - | if dist_id not in cache_dist: | + | |
| - | cache_dist[dist_id] = groups[a].distance(groups[b]) | + | |
| - | distance = cache_dist[dist_id] | + | |
| - | + | ||
| - | # Keep shortest distance | + | |
| - | if shortest_distance is None or distance < shortest_distance: | + | |
| - | shortest_distance = distance | + | |
| - | closest_pair = [a, b] | + | |
| - | + | ||
| - | # Merge the closest pair of groups | + | |
| - | a, b = pop_items(groups, closest_pair) | + | |
| - | groups.append(Group(a.items + b.items)) | + | |
| - | + | ||
| - | return groups | + | |
| - | </code> | + | |
| - | + | ||
| - | <code python> | + | |
| - | #!/usr/bin/python | + | |
| - | from urllib import urlencode, urlopen | + | |
| - | from os.path import exists, join | + | |
| - | from os import makedirs | + | |
| - | from hashlib import md5 | + | |
| - | from collections import defaultdict | + | |
| - | import xml.etree.ElementTree as et | + | |
| - | from datetime import date | + | |
| - | from argparse import ArgumentParser | + | |
| - | + | ||
| - | from cluster import Item, find_clusters | + | |
| - | + | ||
| - | NOT_GENERAL_PROGRAMMING = [ | + | |
| - | 'HTML', 'CSS', 'Haml', 'ClearSilver', | + | |
| - | 'XML', 'XSL Transformation', 'XML Schema', 'MXML', 'XAML', 'QML', | + | |
| - | 'shell script', 'DOS batch script', 'AWK', 'Vim Script', 'DCL', 'NSIS', | + | |
| - | 'Make', 'Automake', 'Autoconf', 'Ebuild', 'CMake', 'Exheres', 'Jam', | + | |
| - | 'TeX/LaTeX', 'MetaFont', 'MetaPost', | + | |
| - | 'SQL', 'IDL/PV-WAVE/GDL', | + | |
| - | 'Assembly', 'OpenGL Shading', 'CUDA', | + | |
| - | 'Matlab', 'Octave', 'R', 'Scilab', | + | |
| - | 'Stratego', 'Puppet', 'VHDL', | + | |
| - | ] | + | |
| - | ALIAS = { | + | |
| - | 'C/C++':'C++', | + | |
| - | # Lisp family | + | |
| - | 'Emacs Lisp': 'Lisp', 'Scheme': 'Lisp', 'Racket': 'Lisp', 'Clojure': 'Lisp', | + | |
| - | # Fortran Family | + | |
| - | 'Fortran (Fixed-format)': 'Fortran', 'Fortran (Free-format)': 'Fortran', | + | |
| - | # Basic Family | + | |
| - | 'Visual Basic': 'Basic', 'Structured Basic': 'Basic', 'Classic Basic': 'Basic', | + | |
| - | # Modula Family | + | |
| - | 'Modula-2': 'Modula', 'Modula-3': 'Modula', 'Oberon': 'Modula' | + | |
| - | } | + | |
| - | THRESHOLD = 0.45 | + | |
| - | + | ||
| - | def get_top_languages(key, sort): | + | |
| - | languages = defaultdict(int) | + | |
| - | params = { | + | |
| - | 'api_key': key, | + | |
| - | 'sort' : sort, | + | |
| - | 'page' : 1 | + | |
| - | } | + | |
| - | while True: | + | |
| - | query = sorted(params.items()) | + | |
| - | url = "http://www.ohloh.net/languages.xml?%s" % urlencode(query) | + | |
| - | date_dir = date.today().strftime("%y_%m_%d") | + | |
| - | filename = md5(url).hexdigest() + '.xml' | + | |
| - | cache_dir = join('data', date_dir) | + | |
| - | if not exists(cache_dir): | + | |
| - | makedirs(cache_dir) | + | |
| - | cache_file = join(cache_dir, filename) | + | |
| - | if exists(cache_file): | + | |
| - | print 'loading:', cache_file | + | |
| - | xml = open(cache_file).read() | + | |
| - | else: | + | |
| - | print 'request:', url | + | |
| - | xml = urlopen(url).read() | + | |
| - | open(cache_file, 'w').write(xml) | + | |
| - | + | ||
| - | root = et.fromstring(xml) | + | |
| - | error = root.find("error") | + | |
| - | if error != None: | + | |
| - | raise Exception('Ohloh Error:', et.tostring(error)) | + | |
| - | + | ||
| - | if root.find("items_returned").text == "0": | + | |
| - | break | + | |
| - | + | ||
| - | for lang_node in root.findall("result/language"): | + | |
| - | name = lang_node.find('nice_name').text | + | |
| - | if name in NOT_GENERAL_PROGRAMMING: | + | |
| - | continue | + | |
| - | if name in ALIAS: | + | |
| - | name = ALIAS[name] | + | |
| - | value = int(lang_node.find(sort).text) | + | |
| - | languages[name] += value | + | |
| - | + | ||
| - | params['page'] += 1 | + | |
| - | + | ||
| - | return sorted([(c, l) for l, c in languages.iteritems()], reverse=True) | + | |
| - | + | ||
| - | + | ||
| - | if __name__ == '__main__': | + | |
| - | parser = ArgumentParser(description='Download the Ohloh languages statistics') | + | |
| - | parser.add_argument('-k', '--key', help='The Ohloh API key', required=True) | + | |
| - | parser.add_argument('-s', '--sort', help='The sorting field', | + | |
| - | choices=['contributors', 'commits', 'code'], default='contributors') | + | |
| - | args = parser.parse_args() | + | |
| - | + | ||
| - | languages = get_top_languages(args.key, args.sort) | + | |
| - | unit = 100.0 / float(languages[0][0]) | + | |
| - | + | ||
| - | items = [] | + | |
| - | for i, (value, lang) in enumerate(languages): | + | |
| - | n = value * unit | + | |
| - | if n < THRESHOLD: break | + | |
| - | print '%2d) %.2f - %s' % (i+1, n, lang) | + | |
| - | items.append(Item(lang, (n,))) | + | |
| - | + | ||
| - | # Find Popularity Clusters | + | |
| - | cluster_names = ('Ubiquitous', 'Very Popular', 'Popular', 'Niche') | + | |
| - | clusters = find_clusters(items, len(cluster_names)) | + | |
| - | clusters.sort(reverse=True) | + | |
| - | + | ||
| - | print "\nPopularity Clusters:" | + | |
| - | for i, label in enumerate(cluster_names): | + | |
| - | print "%s: %s" % (label, clusters[i]) | + | |
| - | </code> | + | |
| <code> | <code> | ||
| - | 1) 100.00 - C | + | 1) 100.00 - JavaScript |
| - | 2) 99.73 - Java | + | 2) 99.99 - C |
| - | 3) 98.37 - JavaScript | + | 3) 99.83 - Java |
| - | 4) 85.98 - C++ | + | 4) 86.29 - C++ |
| - | 5) 66.60 - Python | + | 5) 69.39 - Python |
| - | 6) 52.69 - PHP | + | 6) 52.13 - PHP |
| - | 7) 35.45 - Perl | + | 7) 36.83 - C# |
| - | 8) 34.42 - Ruby | + | 8) 34.84 - Ruby |
| - | 9) 29.90 - C# | + | 9) 34.39 - Perl |
| - | 10) 11.49 - Objective-C | + | 10) 12.41 - Objective-C |
| - | 11) 11.06 - Lisp | + | 11) 11.01 - Lisp |
| - | 12) 9.99 - Modula | + | 12) 9.03 - Modula |
| - | 13) 8.34 - ActionScript | + | 13) 7.98 - ActionScript |
| - | 14) 5.68 - Basic | + | 14) 6.12 - Basic |
| - | 15) 5.39 - Lua | + | 15) 5.31 - Lua |
| - | 16) 3.85 - Pascal | + | 16) 3.99 - Pascal |
| 17) 3.72 - D | 17) 3.72 - D | ||
| - | 18) 3.30 - Groovy | + | 18) 3.45 - Groovy |
| - | 19) 3.23 - Fortran | + | 19) 3.25 - Fortran |
| - | 20) 3.20 - Tcl | + | 20) 3.09 - Tcl |
| - | 21) 2.86 - Haskell | + | 21) 2.84 - Haskell |
| - | 22) 2.26 - Scala | + | 22) 2.44 - Scala |
| - | 23) 1.69 - Erlang | + | 23) 2.23 - CoffeeScript |
| - | 24) 1.53 - Objective Caml | + | 24) 1.72 - Erlang |
| - | 25) 1.08 - CoffeeScript | + | 25) 1.54 - Objective Caml |
| - | 26) 0.84 - Ada | + | 26) 0.81 - Ada |
| - | 27) 0.66 - Vala | + | 27) 0.75 - Go |
| - | 28) 0.63 - Go | + | 28) 0.74 - F# |
| - | 29) 0.55 - F# | + | 29) 0.68 - Vala |
| - | 30) 0.49 - Eiffel | + | 30) 0.46 - Eiffel |
| 31) 0.45 - HaXe | 31) 0.45 - HaXe | ||
| Popularity Clusters: | Popularity Clusters: | ||
| - | Ubiquitous: [C, Java, JavaScript, C++] | + | Ubiquitous: [JavaScript, C, Java, C++] |
| - | Very Popular: [Python, PHP] | + | Very Popular: [Python] |
| - | Popular: [Perl, Ruby, C#] | + | Popular: [PHP, C#, Ruby, Perl] |
| - | Niche: [Objective-C, Lisp, Modula, ActionScript, Basic, Lua, Pascal, D, Groovy, Fortran, Tcl, Haskell, Scala, Erlang, Objective Caml, CoffeeScript, Ada, Vala, Go, F#, Eiffel, HaXe] | + | Niche: [Objective-C, Lisp, Modula, ActionScript, Basic, Lua, Pascal, D, Groovy, Fortran, Tcl, Haskell, Scala, CoffeeScript, Erlang, Objective Caml, Ada, Go, F#, Vala, Eiffel, HaXe] |
| </code> | </code> | ||
| - | |||
| - | Black Duck software provides a wider analysis: [[http://www.blackducksoftware.com/oss/projects|Open Source Project Data]] | ||
| ==== Language Analysis ==== | ==== Language Analysis ==== | ||
| Line 236: | Line 67: | ||
| ==== Packages ==== | ==== Packages ==== | ||
| - | * Python: [[http://pypi.python.org/pypi|22169]] | + | * Ruby: [[https://rubygems.org/stats|41,284]] |
| - | * Perl: [[http://www.cpan.org/modules/01modules.index.html|22115]] | + | * Python: [[http://pypi.python.org/pypi|22,169]] |
| - | * Haskell: [[http://hackage.haskell.org/packages/archive/pkg-list.html|5500]] | + | * Perl: [[http://www.cpan.org/modules/01modules.index.html|22,115]] |
| - | * Ruby: [[http://rubygems.org/gems|2576]] | + | * Haskell: [[http://hackage.haskell.org/packages/archive/pkg-list.html|5,500]] |
| ===== Benchmarks ===== | ===== Benchmarks ===== | ||