Download as txt, pdf, or txt
Download as txt, pdf, or txt
You are on page 1of 5

DSBDA ASSIGNMENT 1

Importing Libraries

import pandas as pd

Loading csv file

data= pd.read_csv("C:/Users/Neha/Desktop/FOLDERS/6th
SEM/DSBDA_PRACTICALS/StudentsPerformance.csv")
data.columns
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')

data.gender
0 female
1 female
2 female
3 male
4 male
...
995 female
996 male
997 female
998 female
999 female
Name: gender, Length: 1000, dtype: object

data['math score']
0 72
1 69
2 90
3 47
4 76
..
995 88
996 62
997 59
998 68
999 77
Name: math score, Length: 1000, dtype: int64

data.isnull() #checks if there are any null values


gender race/ethnicity parental level of education lunch test
preparation course math score reading score writing score
0 False False False False False False False False
1 False False False False False False False False
2 False False False False False False False False
3 False False False False False False False False
4 False False False False False False False False
... ... ... ... ... ... ... ... ...
995 False False False False False False False False
996 False False False False False False False False
997 False False False False False False False False
998 False False False False False False False False
999 False False False False False False False False
data1= pd.read_csv("C:/Users/Neha/Desktop/FOLDERS/6th
SEM/DSBDA_PRACTICALS/score.csv")
data1.columns
Index(['Name', 'M1 Score', 'M2 Score'], dtype='object')

WRITING TO A CSV FILE USING

1)USING csv.writer

import csv
header=['Name','M1 Score','M2 Score']
data1 = [['Alex', 62, 80], ['Brad', 45, 56], ['Joey', 85, 98]]
filename = 'score.csv'
with open(filename, 'w', newline="") as file:
csvwriter = csv.writer(file) # create a csvwriter object

csvwriter.writerow(header) # write the header


csvwriter.writerows(data1) # write the rest of the data

2) USING csv.writer

import csv
header=['Name','M1 Score','M2 Score']
data1 = [['Alex', 62, 80], ['Brad', 45, 56], ['Joey', 85, 98]]
filename = 'score.csv'
with open(filename, 'w', newline="") as file:
csvwriter=csv.writer(file) #create csvwriter object
csvwriter.writerow(header) #write header
csvwriter.writerows(data) #write the rest of the data

3) USING .writelines()

header=['Name','M1 Score','M2 Score']


data1=[['Alex', 62, 80], ['Brad', 45, 56], ['Joey', 85, 98]]
filename = 'score.csv'
with open(filename, 'w') as file:
for header in header:
file.write(str(header)+',')
file.write('\n')
for row in data1:
for x in row:
file.write(str(x)+',')
file.write('\n')

4)USING pandas

header = ['Name', 'M1 Score', 'M2 Score']


data1 = [['Alex', 62, 80], ['Brad', 56], ['Joey', 85, 98],[85,98]]
data1 = pd.DataFrame(data1, columns=header)
data1.to_csv('score.csv', index=False) #The index=False argument specifies that
the row indices should not be included in the CSV file.

data1.isnull()
Name M1 Score M2 Score
0 False False False
1 False False True
2 False False False
3 False False True
data1.Name.isnull()
0 False
1 False
2 False
3 False
Name: Name, dtype: bool

data1.isnull().sum()
Name 0
M1 Score 0
M2 Score 2
dtype: int64

Count NaN(Not a Number) values in Pandas DataFrame

1) Using Numpy

import pandas as pd
import numpy as np

data2={'set_of_numbers':[1,2,3,4,5,np.nan,6,7,np.nan,np.nan,8,9,10,np.nan]}
df=pd.DataFrame(data2)
print(df)
set_of_numbers
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
5 NaN
6 6.0
7 7.0
8 NaN
9 NaN
10 8.0
11 9.0
12 10.0
13 NaN

Count number of NaN values in the DataFrame

import pandas as pd
import numpy as np

data3={'first_set':[1,2,3,4,5,np.nan,6,7,np.nan,np.nan],
'second_set':['a','b',np.nan,np.nan,'c','d','e',np.nan,np.nan,'f'],
'third_set':['aa',np.nan,'bb','cc',np.nan,np.nan,'dd',np.nan,np.nan,'ee']}
df1=pd.DataFrame(data3,columns=['first_set','second_set','third_set'])
print(df1)
first_set second_set third_set
0 1.0 a aa
1 2.0 b NaN
2 3.0 NaN bb
3 4.0 NaN cc
4 5.0 c NaN
5 NaN d NaN
6 6.0 e dd
7 7.0 NaN NaN
8 NaN NaN NaN
9 NaN f ee

# Count NaNvalues under a single DataFramecolumn


count_nan=df1['first_set'].isna().sum()
print('count of NaN :'+ str(count_nan))
count of NaN :3

# Count NaN values under the entire DataFrame


count_nan_full=df1.isna().sum().sum()
print('count of entire NaN :'+ str(count_nan_full))
count of entire NaN :12

# Count NaN values across a single DataFramerow


count_nan_row=df1.loc[[7]].isna().sum().sum()
print('count of row NaN :'+ str(count_nan_row))
count of row NaN :2

Remove Duplicates from Pandas DataFrame

import pandas as pd
boxes={'Color': ['Green','Green','Green','Blue','Blue','Red','Red','Red'],
'Shape':
['Rectangle','Rectangle','Square','Rectangle','Square','Square','Square','Rectangle
']}
df2=pd.DataFrame(boxes,columns=['Color','Shape'])
print(df2)
Color Shape
0 Green Rectangle
1 Green Rectangle
2 Green Square
3 Blue Rectangle
4 Blue Square
5 Red Square
6 Red Square
7 Red Rectangle

df2_duplicates_removed=df2.drop_duplicates()
print(df2_duplicates_removed)
#only the distinct values across the two columns remain
Color Shape
0 Green Rectangle
2 Green Square
3 Blue Rectangle
4 Blue Square
5 Red Square
7 Red Rectangle

# to remove the duplicates on a specific column


df2_duplicates_removed=df2.drop_duplicates(subset=['Color'])
print(df2_duplicates_removed)
Color Shape
0 Green Rectangle
3 Blue Rectangle
5 Red Square

df2_duplicates_removed=df2.drop_duplicates(subset=['Shape'])
print(df2_duplicates_removed)
Color Shape
0 Green Rectangle
2 Green Square

# Get the Descriptive Statistics for Pandas DataFram

#step 1 -- collect data


#step 2 --create dataframe
import pandas as pd

data_a = {'product': ['A', 'B', 'C', 'c','D'], 'price': [22000, 27000, 25000,
29000, 35000], 'year': [2014, 2015, 2016, 2017, 2018] }

df_a = pd.DataFrame(data_a)
df_a
product price year
0 A 22000 2014
1 B 27000 2015
2 C 25000 2016
3 c 29000 2017
4 D 35000 2018

#step 3 -- Get the Descriptive Statistics for Pandas DataFrame

# Descriptive Statistics for column price


stats_numeric=df_a['price'].describe()
stats_numeric
count 5.000000
mean 27600.000000
std 4878.524367
min 22000.000000
25% 25000.000000
50% 27000.000000
75% 29000.000000
max 35000.000000
Name: price, dtype: float64

# Descriptive Statistics for column product


stats_categorical= df_a['product'].describe()
stats_categorical
count 5
unique 5
top A
freq 1
Name: product, dtype: object

# Descriptive Statistics for entire dataframe


stats=df_a.describe(include='all')
stats
product price year
count 5 5.000000 5.000000
unique 5 NaN NaN
top A NaN NaN
freq 1 NaN NaN
mean NaN 27600.000000 2016.000000
std NaN 4878.524367 1.581139
min NaN 22000.000000 2014.000000
25% NaN 25000.000000 2015.000000
50% NaN 27000.000000 2016.000000
75% NaN 29000.000000 2017.000000
max NaN 35000.000000 2018.000000

You might also like