Dsbda Assignment 1
Dsbda Assignment 1
Importing Libraries
import pandas as pd
data= pd.read_csv("C:/Users/Neha/Desktop/FOLDERS/6th
SEM/DSBDA_PRACTICALS/StudentsPerformance.csv")
data.columns
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
data.gender
0 female
1 female
2 female
3 male
4 male
...
995 female
996 male
997 female
998 female
999 female
Name: gender, Length: 1000, dtype: object
data['math score']
0 72
1 69
2 90
3 47
4 76
..
995 88
996 62
997 59
998 68
999 77
Name: math score, Length: 1000, dtype: int64
1)USING csv.writer
import csv
header=['Name','M1 Score','M2 Score']
data1 = [['Alex', 62, 80], ['Brad', 45, 56], ['Joey', 85, 98]]
filename = 'score.csv'
with open(filename, 'w', newline="") as file:
csvwriter = csv.writer(file) # create a csvwriter object
2) USING csv.writer
import csv
header=['Name','M1 Score','M2 Score']
data1 = [['Alex', 62, 80], ['Brad', 45, 56], ['Joey', 85, 98]]
filename = 'score.csv'
with open(filename, 'w', newline="") as file:
csvwriter=csv.writer(file) #create csvwriter object
csvwriter.writerow(header) #write header
csvwriter.writerows(data) #write the rest of the data
3) USING .writelines()
4)USING pandas
data1.isnull()
Name M1 Score M2 Score
0 False False False
1 False False True
2 False False False
3 False False True
data1.Name.isnull()
0 False
1 False
2 False
3 False
Name: Name, dtype: bool
data1.isnull().sum()
Name 0
M1 Score 0
M2 Score 2
dtype: int64
1) Using Numpy
import pandas as pd
import numpy as np
data2={'set_of_numbers':[1,2,3,4,5,np.nan,6,7,np.nan,np.nan,8,9,10,np.nan]}
df=pd.DataFrame(data2)
print(df)
set_of_numbers
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
5 NaN
6 6.0
7 7.0
8 NaN
9 NaN
10 8.0
11 9.0
12 10.0
13 NaN
import pandas as pd
import numpy as np
data3={'first_set':[1,2,3,4,5,np.nan,6,7,np.nan,np.nan],
'second_set':['a','b',np.nan,np.nan,'c','d','e',np.nan,np.nan,'f'],
'third_set':['aa',np.nan,'bb','cc',np.nan,np.nan,'dd',np.nan,np.nan,'ee']}
df1=pd.DataFrame(data3,columns=['first_set','second_set','third_set'])
print(df1)
first_set second_set third_set
0 1.0 a aa
1 2.0 b NaN
2 3.0 NaN bb
3 4.0 NaN cc
4 5.0 c NaN
5 NaN d NaN
6 6.0 e dd
7 7.0 NaN NaN
8 NaN NaN NaN
9 NaN f ee
import pandas as pd
boxes={'Color': ['Green','Green','Green','Blue','Blue','Red','Red','Red'],
'Shape':
['Rectangle','Rectangle','Square','Rectangle','Square','Square','Square','Rectangle
']}
df2=pd.DataFrame(boxes,columns=['Color','Shape'])
print(df2)
Color Shape
0 Green Rectangle
1 Green Rectangle
2 Green Square
3 Blue Rectangle
4 Blue Square
5 Red Square
6 Red Square
7 Red Rectangle
df2_duplicates_removed=df2.drop_duplicates()
print(df2_duplicates_removed)
#only the distinct values across the two columns remain
Color Shape
0 Green Rectangle
2 Green Square
3 Blue Rectangle
4 Blue Square
5 Red Square
7 Red Rectangle
df2_duplicates_removed=df2.drop_duplicates(subset=['Shape'])
print(df2_duplicates_removed)
Color Shape
0 Green Rectangle
2 Green Square
data_a = {'product': ['A', 'B', 'C', 'c','D'], 'price': [22000, 27000, 25000,
29000, 35000], 'year': [2014, 2015, 2016, 2017, 2018] }
df_a = pd.DataFrame(data_a)
df_a
product price year
0 A 22000 2014
1 B 27000 2015
2 C 25000 2016
3 c 29000 2017
4 D 35000 2018