Reading an entire file at once: import csv
filename = '[Link]' with open("[Link]") as file_pointer: Generating current date
with open(filename) as f_obj: csv_pointer = [Link](file_pointer) from datetime import datetime as dt
contents = f_obj.read() Writing data can be written from list of strings today = [Link]()
print(contents) date_string = [Link](today,'%m/%d/%Y')
data_row = [‘’_‘’,’’_ ‘’ ] print(date_string)
Reading line by line with open("[Link]","w") as file_pointer:
filename = '[Link]' csv_pointer = [Link](file_pointer) Generating a specific date
with open(filename) as f_obj: csv_pointer.writerow(data_row) from datetime import datetime as dt
for line in f_obj: new_years = dt(2017,1,1)
print([Link]()) import seaborn as sns fall_equinox = dt(year=2016, month=9,day=22)
import numpy as np
Storing the lines in a list: lower_triangle = [Link]([Link]())
filename = '[Link]' [Link]([Link](), annot=True, mask=lower_triangle,
with open(filename) as f_obj: cmap= 'coolwarm') Create Dataframe
lines = f_obj.readlines() # triu() is a method in NumPy that returns the upper triangle of any Method 1: import pandas as pd
for line in lines: matrix given to it. listlist = [['John', 21], ['Emilie', 49]]
print([Link]()) df_listlist = [Link](listlist, columns = ['Name', 'Age'])
[Link][:,["XXX", "YYY"]].[Link]() boxplot df_listlist
file_pointer.read() #Read as single string [Link][:,["XXX", "YYY"]].[Link](alpha=0.5)overlapped colour
file_pointer.readline() #Read line by line as string diagram Method 2:
file_pointer.readlines()#Read as a single list [Link](x='product1_price', y='product1_quantity_ordered'); dictlist = {'Name':['Robert', 'Faye'], 'Age':[56, 28]}
scatter diagram df_dictlist = [Link](dictlist)
file_pointer = open("[Link]","w")
data_list = [ "D1","D2"] import pandas as pd
for each_line in data_list: import [Link] as plt
print(each_line,file=file_pointer) %matplotlib inline Method 3:
[Link]()
name = ['Nada', 'Gareth', 'Johnny', 'Georgina']
[Link][:, 2:8].sum().plot(kind="bar") age = [25, 30, 26, 22]
file_pointer.write(s) #s will be the string variable containing date [Link][:,2:8].sum().sort_values().plot(kind="bar")ascendin list_of_tuples = list(zip(name, age))
file_pointer.writelines(sequence) #write a sequence of strings to g order print(list_of_tuples) [('Nada', 25), ('Gareth', 30)]
the file [Link]("Distribution df = [Link](list_of_tuples, columns = ['Name', 'Age'])
Mode How opened File Exists File does not Channel").sum().[Link](kind="bar") only df
exists distribution channel 1&2
r Read only Opens file Error df_total = df[[Link] == "Total Population Growth"] data = pd.read_csv(‘[Link]')
w Write only Clears the file Creates & df_resident = df[[Link] == "Resident Population Growth"]
contents opens new fig, ax = [Link]() DataFrame Functions:
file [Link](df_total.Year, df_total.Growth) [Link]() first 5 rows(default)
a Write only File contents Creates & [Link](df_resident.Year, df_resident.Growth) [Link]()last 5 rows(default) , unless replaced by number
left intact & opens a new #[Link](["Total Population Growth", "Resident Population [Link]column labels of DataFrame
new data file Growth"]) [Link]() descriptive statistics abt every numeric
appended at [Link]([Link]()) column ie count,mean,std
file’s end
[Link]()/isna().sum()check total [Link] empty cells for
r+ Read & write Reads & Error import seaborn as sns each column
overwrites from df = pd.read_csv("[Link]") df[df[‘column_name’].isnull()]show NaN in that column
the file’s [Link](df) df[[Link]().any(axis=1)]show rows in DataFrame are NaN
beginning
df['XXX'].fillna(df['XXX'].mean().round(2), inplace=True)fill
w+ Read & write Clears the file Creates & empty cells in specified column with its column mean
contents opens a new Making a scatter plot df=[Link](how=’any’)delete rows with at least 1 empty
file import [Link] as plt cell
a+ Read & write File contents Creates & x_values = list(range(1000))
[Link]()/mean()/median()/mode()/var()/cov()/sum()/min()/
left intact and opens a new squares = [x**2 for x in x_values]
max()
read & write at file [Link](x_values, squares, s=10)
file’s end [Link]()
[Link]().sum()total no. of rows with duplicated df[[Link] >= 15.0].sort_values("score", ascending=False)
values [["name","score"]] or INSERT INTO MANAGER(AA,BB) VALUES (1003, "Jenny");
df[df[['AA','BB']].duplicated()]show only those rows with [Link][[Link] >= 15.0, SELECT * FROM __ * denote list out all fields
duplicated AA and BB ["name","score"]].sort_values("score", ascending=False) SELECT A,B / DISTINCT/ COUNT(*) / MIN(XXX)/ MAX(XXX)/
df.drop_duplicates(inplace=True) data[(data[‘XXX’] >= 9) & (data[‘YYY']<= 4)]
SUM(XXX) AS TOTAL_VALUE/ AVG(XXX) FROM __
[Link](columns = [‘??’], inplace = True)drop columns filtered.reset_index(drop = True, inplace = True)
[Link](row_index_number) WHERE XXX >= 12
data[(data[‘XXX’] >= 9) | (data[‘YYY']<= 4)] ‘OR’
[Link]([‘column_name’]).mean()/.sum()/.count() <>(not equal), ^(to the power of)
data[data[‘XXX’].[Link]/endswith/startswith(‘aaa’)]
[Link]([‘XXX’]).count()[‘YYY’].plot(kind=’bar’) no. of BETWEEN, IS NULL, LIKE, EXISTS, IN, AND
data[~data[‘XXX’].[Link](‘aaa’)] print those w/o ‘aaa’
YYY under each XXX row = ["Peter", 18.0, 2, "yes"]
[Link]("aaa","AAA", inplace=True)all aaa in [Link][len(df)] = row
dataFrame vertical_stack=[Link]([XX_df.head(5),
add new row to the bottom
[Link][3,1]= 14.6 replace particular entry only YY_df.tail(10)],axis=0)
1. Display all album titles by Iron Maiden vertical_stack.reset_index(inplace=True) renumber the
DataFrame Attributes: index
SELECT Title,
[Link](rows,columns) horizontal_stack=[Link]([XX_df.head(5),
Name
[Link]()check empty cells or data type of each column YY_df.tail(10)],axis=1)
FROM albums
[Link] read column,if column name w/o empty spaces Combining data
INNER JOIN
data[[‘XXX’,’YYY’]]
artists ON [Link] = [Link]
type(df.column_name) WHERE Name = "Iron Maiden";
2. Find all artists who do not have any albums, sorted by
Indexing: artists’ names
[Link][1]/[Link][1,:] SELECT Name,
[Link][3,4]read (row,column) Title
[Link][[0,2]] row index 0 and 2 FROM artists
[Link][0,2]select cell at row 0 and column 2 LEFT JOIN
[Link][:4]/[Link][:4,:]read row 0 to 3 albums ON [Link] = [Link]
[Link][[0,1],[0,1]]select row 0 &1, column 0 & 1 WHERE Title IS NULL
[Link][2:4,5:7]select row 2 & 3, and give value from ORDER BY Name;
column 5 & 6 3. Display all unique cities for each country where the
[Link][df.column_name>5000, customers are located in
col_start_index:col_stop_index_exclusive]
[Link][df.column_name ==
SELECT DISTINCT city, merged_inner = [Link](left=XXX,right=YYY) if a row in
country XXX has a value of AA that doesn’t appear in AA column of
“???”,col_start_index:col_stop_index_exclusive] FROM customers YYY, it will not be included
[Link][:,’XXX’]read (row,column_name) ORDER BY country;
[Link][[0,2]]row index 0 & 2 4. Find the number of tracks for each album ID
[Link][2:4,’XXX’]read row 2 to 4, in that particular column SELECT albumid,
[Link][df['gender']=='Female', COUNT(trackid)
[‘column_name1’,’column_name2’] FROM tracks
[Link][0,['EmpID']] EmpID 21 (read down) GROUP BY albumid;
[Link][[0],'EmpID'] 0 21 (read across) 5. Find the total length and bytes for each album ID
[Link][0,'EmpID'] 21 (read across) SELECT albumid,
[Link][[0],["EmpID"]] 0 21 , with column name on top SUM(milliseconds) AS length,
(read down) SUM(bytes) AS size
FROM tracks
df.sort_values(‘XXX’)sort one column in ascending order GROUP BY albumid;
df.sort_values(‘XXX’,ascending=False)sort in descending 6. Display the track IDs & track names for which media type id
order is 1 or 2
df.sort_values([‘XXX’,’YYY’],ascending = (True, False) ) SELECT TrackId,
#not saved, if want need to add ,inplace =True Name,
data[‘ZZZ’] = data[‘XXX’] * data[‘YYY’] Mediatypeid
data[‘ZZZ’] = [Link][:,[‘XXX’,’YYY’]].sum(axis=1)add FROM Tracks
column ZZZ=XXX+YYY WHERE MediaTypeId IN (1, 2);