# -*- coding: utf-8 -*-
"""
Created on Sun Oct 4 20:25:31 2015
@author: Abhishek
"""
import pandas
import numpy
# Quartile breakdown of female employment rate (all countries) and of income
# per person (G20 member countries only) from the Gapminder data set.
data = pandas.read_csv('gapminder.csv', low_memory=False)
pandas.set_option('display.float_format', lambda x: '%f'%x)

# Coerce the analysed columns to numeric dtype. Values that cannot be parsed
# become NaN (errors='coerce'), matching what the old
# convert_objects(convert_numeric=True) call did; that method has been removed
# from pandas, so to_numeric is used instead.
for column in ('femaleemployrate', 'incomeperperson', 'polityscore'):
    data[column] = pandas.to_numeric(data[column], errors='coerce')

# Country names, as spelled in the 'country' column, used to select the G20 subset.
G20_COUNTRIES = (
    'Argentina',
    'Australia',
    'Brazil',
    'Canada',
    'China',
    'France',
    'Germany',
    'India',
    'Indonesia',
    'Italy',
    'Japan',
    'Mexico',
    'Russia',
    'Saudi Arabia',
    'South Africa',
    'Korea, Rep.',
    'Turkey',
    'United Kingdom',
    'United States',
)

# Labels assigned to the four qcut bins, lowest to highest.
QUARTILE_LABELS = ["1=25%tile", "2=50%tile", "3=75%tile", "4=100%tile"]

dataG20Copy = data[data['country'].isin(G20_COUNTRIES)]
# Not always necessary but can eliminate a setting with copy warning that is displayed
dataG20 = dataG20Copy.copy()

print('FEMALE EMP RATE: 4 Quartiles')
# Quartiles here are computed over the full data set, not the G20 subset.
data['femaleemployrate4'] = pandas.qcut(data.femaleemployrate, 4, labels=QUARTILE_LABELS)
# Percentage of non-missing rows in each quartile, most frequent first.
qF = data['femaleemployrate4'].value_counts(sort=True, dropna=True, normalize=True) * 100
print(qF)

print('INCOME PER PERSON: 4 Quartiles')
dataG20['incomeperperson4'] = pandas.qcut(dataG20.incomeperperson, 4, labels=QUARTILE_LABELS)
# Percentage of non-missing G20 rows in each quartile, left unsorted by count.
gdpQ = dataG20['incomeperperson4'].value_counts(sort=False, dropna=True, normalize=True) * 100
print(gdpQ)
Sunday, October 4, 2015
Data Management
It turns out that I had already done some data management in my previous assignment. While calculating the frequency distribution of female employment rate, I grouped the employment rates into 4 groups, and I applied a similar grouping technique when calculating GDP. I feel the Gapminder data set is rather straightforward and does not require much management.
Instead of arbitrary groups, for this assignment I will break female employment rate and GDP down into quartiles.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment