This notebook fetches every package from the PyPI server (~74,000 packages!) and extracts each package's setup.py along with any file or folder whose name contains the word 'requirements'.
import xmlrpclib
# There is only one XML-RPC API server, so the package index is always queried
# from the main PyPI host.
# NOTE(review): the original comment mentioned using a German ("deutschland")
# mirror for downloading, but no mirror URL appears in this code -- confirm.
client = xmlrpclib.ServerProxy('http://pypi.python.org/pypi')
# Every entry of this list is later passed to extract_package() as a package name.
packages = client.list_packages()
import tarfile, re, requests, csv, json
from base64 import b64encode
from kglib.utils.HelperFunctions import ensure_dir
def _save_file(pathname, member, tar_file):
    """Write the contents of one archive member to disk.

    The output path is ``pathname`` immediately followed by the base name of
    ``member.name``; the member's directory structure inside the archive is
    dropped, so two members sharing a base name overwrite each other.

    Parameters:
        pathname: string prefix of the output path (callers in this file pass
            a directory path ending in '/').
        member: the archive member to extract, as yielded by
            ``tar_file.getmembers()``.
        tar_file: the open archive that ``member`` belongs to.

    Returns:
        None. Members that cannot be read are skipped silently (best effort).
    """
    # Local import: the file's import lines do not bring ``os`` into scope.
    import os

    try:
        extracted = tar_file.extractfile(member)
        # extractfile() returns None for members that carry no data
        # (e.g. directories); there is nothing to save for those.
        if extracted is None:
            return
        content = extracted.read()
    except Exception:
        # Best effort: an unreadable member is skipped. ``Exception`` (rather
        # than a bare ``except``) lets Ctrl-C / SystemExit stop a long run.
        return

    outfilename = '{}{}'.format(pathname, os.path.basename(member.name))
    ensure_dir(outfilename)
    # Archive members are raw bytes, so write them in binary mode.
    with open(outfilename, 'wb') as outfile:
        outfile.write(content)
    return
def _extract_files(package_file, name):
    """Save the setup.py and requirements entries of a tar archive.

    Every archive member whose path contains the substring 'setup.py' or
    'requirements' is handed to ``_save_file`` with ``name`` as the output
    path prefix.

    Parameters:
        package_file: a readable binary file object containing the archive.
        name: output path prefix forwarded to ``_save_file``.

    Returns:
        None. If the file object cannot be opened as a tar archive, nothing
        is extracted and the function returns silently.
    """
    try:
        tar_file = tarfile.open(fileobj=package_file)
    except Exception:
        # Best effort: anything that is not a readable tar archive is
        # skipped. ``Exception`` (rather than a bare ``except``) lets
        # Ctrl-C / SystemExit stop a long run.
        return

    try:
        for member in tar_file.getmembers():
            if 'setup.py' in member.name or 'requirements' in member.name:
                _save_file(name, member, tar_file)
    finally:
        # Release the archive handle even if a member fails part-way.
        tar_file.close()
def extract_package(name, client=xmlrpclib.ServerProxy('http://pypi.python.org/pypi')):
    """Download a package's source tarball and extract its setup/requirements files.

    For each release reported by ``client.package_releases(name)``, the
    release's URL list is searched for a source distribution whose URL ends
    in 'gz'. The archive is downloaded and passed to ``_extract_files`` with
    the output prefix ``packages/<name>-<release>/``.

    Parameters:
        name: the package name to look up.
        client: XML-RPC proxy used for ``package_releases`` and
            ``release_urls``; defaults to a proxy for the PyPI endpoint.

    Returns:
        None. The function returns right after the first release whose
        archive was downloaded successfully; releases with no usable URL or
        a non-200 response are skipped.
    """
    # Local import: the file's import lines do not bring ``io`` into scope.
    import io

    for release in client.package_releases(name):
        outdir = 'packages/{}-{}/'.format(name, release)
        doc = client.release_urls(name, release)
        if not doc:
            continue

        # Keep the last source-distribution URL ending in 'gz', if any.
        url = None
        for d in doc:
            if d['python_version'] == 'source' and d['url'].endswith('gz'):
                url = d['url']
        if not url:
            continue

        req = requests.get(url)
        if req.status_code != 200:
            print("Could not download file %s" % req.status_code)
            continue

        ensure_dir(outdir)
        # Hand the downloaded bytes over in memory instead of round-tripping
        # through a fixed temp file opened in text mode, which is unsafe for
        # binary data and for concurrent runs.
        _extract_files(io.BytesIO(req.content), name=outdir)
        # NOTE(review): stops after the first successfully downloaded
        # release, as the original did -- confirm this is intended.
        return
# Presumably prepares the output root used by extract_package -- the exact
# semantics of ensure_dir live in kglib; verify there.
ensure_dir('packages')

# NOTE(review): an earlier run iterated over
# packages[5300+14100+12400+18500:] instead, apparently to resume after
# interruptions.
n_packages = len(packages)
for i, package in enumerate(packages):
    # Progress line once every 100 packages.
    if not i % 100:
        print('Extracting package {} / {}'.format(i + 1, n_packages))
    extract_package(package, client)
We now have the setup.py and requirements files for every PyPI package. I use my own fork of the repository that provides the `detect-requirements` command to find the requirements of every package, using the following shell script:
# Run detect-requirements on every entry under packages/, printing the entry's
# path before its results and an empty line after them.
for p in packages/*
do
    # If the glob matches nothing it stays as the literal 'packages/*'; skip it.
    [ -e "$p" ] || continue
    # Quote the path so names with spaces or glob characters stay intact.
    echo "$p"
    detect-requirements "$p"
    echo ''
done
I will parse the output of this script in a different notebook.