Fetching PyPI data

This notebook fetches every package from the PyPI server (~74,000 packages!) and, for each one, extracts the setup.py and any file or folder with the word 'requirements' in its name.

In [ ]:
import xmlrpclib
# Open an XML-RPC connection to the PyPI index and pull the full package list.
# NOTE(review): an earlier comment here mentioned a German mirror, but the URL
# below is the main pypi.python.org host -- confirm which one is intended.
PYPI_XMLRPC_URL = 'http://pypi.python.org/pypi'
client = xmlrpclib.ServerProxy(PYPI_XMLRPC_URL)
packages = client.list_packages()
In [ ]:
import csv
import io
import json
import os
import re
import tarfile
from base64 import b64encode

import requests
from kglib.utils.HelperFunctions import ensure_dir
                

def _save_file(pathname, member, tar_file):
    """Write one archive member to disk next to ``pathname``.

    The output path is ``pathname`` concatenated with the basename of
    ``member.name``. Because only the basename is kept, members that share a
    file name but live in different archive directories overwrite each other.

    Parameters
    ----------
    pathname : str
        Prefix for the output path (callers pass a directory ending in '/').
    member : tarfile.TarInfo
        The archive member to extract.
    tar_file : tarfile.TarFile
        The open archive that ``member`` belongs to.

    Returns
    -------
    None
        Members that cannot be read (including non-file members such as
        directories) are skipped silently.
    """
    try:
        extracted = tar_file.extractfile(member)
        if extracted is None:
            # extractfile() yields None for members that are not regular
            # files or links (e.g. directories); there is nothing to save.
            return
        try:
            content = extracted.read()
        finally:
            extracted.close()
    except Exception:
        # Best-effort: a damaged member must not abort the whole crawl.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        return

    outfilename = '{}{}'.format(pathname, os.path.basename(member.name))
    # presumably creates the parent directory of outfilename -- TODO confirm
    ensure_dir(outfilename)
    # Archive contents are raw bytes, so write in binary mode.
    with open(outfilename, 'wb') as outfile:
        outfile.write(content)
    return
                

def _extract_files(package_file, name):
    """Save the setup.py / requirements members of a tar archive.

    Every member whose path contains the substring 'setup.py' or
    'requirements' is handed to ``_save_file``. The match is on the full
    member path, so files inside a directory called e.g. ``requirements/``
    are included as well.

    Parameters
    ----------
    package_file : file-like object
        Binary stream holding the (optionally compressed) tar archive.
    name : str
        Output path prefix forwarded to ``_save_file``.

    Returns
    -------
    None
        If the stream cannot be opened as a tar archive, nothing is written.
    """
    try:
        tar_file = tarfile.open(fileobj=package_file)
    except Exception:
        # Best-effort: skip uploads that are not readable tar archives.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        return
    try:
        for member in tar_file.getmembers():
            if 'setup.py' in member.name or 'requirements' in member.name:
                _save_file(name, member, tar_file)
    finally:
        # Release the archive handle even if iterating the members fails.
        tar_file.close()
                
                
def extract_package(name, client=xmlrpclib.ServerProxy('http://pypi.python.org/pypi')):
    """Download a package's source tarball and keep its dependency files.

    For each release returned by ``client.package_releases(name)``, looks for
    a source distribution whose URL ends in 'gz', downloads it, and passes it
    to ``_extract_files`` with the output prefix
    ``packages/<name>-<release>/``. The function returns after the first
    release whose archive downloads successfully; later releases are not
    visited.

    Parameters
    ----------
    name : str
        Package name as known to the index.
    client : xmlrpclib.ServerProxy, optional
        XML-RPC client used to query releases and download URLs.

    Returns
    -------
    None
    """
    for release in client.package_releases(name):
        outdir = 'packages/{}-{}/'.format(name, release)
        doc = client.release_urls(name, release)
        if not doc:
            continue

        # If several source archives match, the last one listed wins.
        url = None
        for d in doc:
            if d['python_version'] == 'source' and d['url'].endswith('gz'):
                url = d['url']
        if not url:
            continue

        req = requests.get(url)
        if req.status_code != 200:
            print("Could not download file %s" % req.status_code)
            continue

        ensure_dir(outdir)
        # Hand the downloaded bytes over in memory. This avoids a fixed
        # temp-file path (which concurrent runs would clobber) and the
        # text-mode write/read that is not safe for binary archive data.
        return _extract_files(io.BytesIO(req.content), name=outdir)
In [ ]:
# Prepare the output root (presumably created by ensure_dir -- TODO confirm),
# then process the whole package list. To resume an interrupted run, iterate
# over a slice of `packages` instead.
ensure_dir('packages')
total = len(packages)
for position, package in enumerate(packages, 1):
    # Report progress on the 1st, 101st, 201st, ... package.
    if position % 100 == 1:
        print('Extracting package {} / {}'.format(position, total))

    extract_package(package, client)

We now have the setup.py and requirements files for every PyPI package. I use my own fork of this repository to find the requirements of each package, using the following script:

# Run detect-requirements on every extracted package directory.
# "$p" is quoted so paths with spaces or glob characters stay intact.
for p in packages/*
do
  echo "$p"
  detect-requirements "$p"
  echo ''
done

I will parse the output of this using a different notebook.

In [ ]: