import pandas as pd


# Load adult.data. No header row is read from the file (header=None), so the
# column labels are supplied explicitly, and no column is used as the index.
df = pd.read_csv(
    "/content/drive/MyDrive/Datasets/adult.data",
    header=None,
    index_col=False,
    names=[
        'age',
        'workclass',
        'fnlwgt',
        'education',
        'education-num',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
        'native-country',
        '<=50K',
    ],
)


# Dimensions of the loaded frame as (rows, columns).
df.shape

(32561, 15)


# Preview the first rows to sanity-check the parsed columns.
df.head()


# List the column labels assigned at load time.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       '<=50K'],
      dtype='object')


# Separate the label column from the predictor columns: the target is named
# once and every other column becomes a model feature.
model_target = '<=50K'
model_features = df.columns.drop(model_target)
print('Model Features: ', model_features)
print('Model Target: ', model_target)

Model Features:  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')
Model Target:  <=50K


from pandas.core.arrays import categorical
import numpy as np
# Split the predictors by dtype: numeric columns versus object (string) columns.
_predictors = df[model_features]
numerical_features_all = _predictors.select_dtypes(include=np.number).columns
print('Numerical Columns: ', numerical_features_all)
categorical_features_all = _predictors.select_dtypes(include='object').columns
print('Categorical Columns: ', categorical_features_all)

Numerical Columns:  Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')
Categorical Columns:  Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')


# Count the rows in each target class to see how balanced the labels are.
df[model_target].value_counts()

 <=50K    24720
 >50K      7841
Name: <=50K, dtype: int64


import matplotlib.pyplot as plt
%matplotlib inline

# Bar chart of the number of rows in each target class.
df[model_target].value_counts().plot(kind='bar', color='cornflowerblue')
plt.show()


# Print the frequency table of every categorical feature so its levels
# (and their sizes) can be reviewed in one pass.
for col in categorical_features_all:
    level_counts = df[col].value_counts()
    print(level_counts)

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital-status, dtype: int64
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64
 Husband           13193
 Not-in-family      8305
 Own-child          5068
 Unmarried          3446
 Wife               1568
 Other-relative      981
Name: relationship, dtype: int64
 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64
 Male      21790
 Female    10771
Name: sex, dtype: int64
 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 France                           29
 Greece                           29
 Ecuador                          28
 Ireland                          24
 Hong                             20
 Cambodia                         19
 Trinadad&Tobago                  19
 Laos                             18
 Thailand                         18
 Yugoslavia                       16
 Outlying-US(Guam-USVI-etc)       14
 Honduras                         13
 Hungary                          13
 Scotland                         12
 Holand-Netherlands                1
Name: native-country, dtype: int64


import matplotlib.pyplot as plt
%matplotlib inline

# Draw one bar chart per categorical feature; the feature name is printed
# first so each figure can be identified in the output.
for c in categorical_features_all:
    print(c)
    df[c].value_counts().plot(kind='bar')
    plt.show()

workclass

education

marital-status

occupation

relationship

race

sex

native-country


import matplotlib.pyplot as plt
%matplotlib inline

# Draw one histogram per numerical feature; the feature name is printed
# first so each figure can be identified in the output.
for n in numerical_features_all:
    print(n)
    # Histogram the feature's own values. Calling value_counts() first would
    # instead histogram how often each distinct value occurs, which does not
    # show the distribution of the feature itself.
    df[n].plot.hist(bins=5)
    plt.grid(True)
    plt.show()

age

fnlwgt

education-num

capital-gain

capital-loss

hours-per-week

Exploratory Data Analysis and Data Preparation for Predicting Income from Census Data

Abstract

Vimal Octavius PJ

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country	<=50K
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K