# Import pandas and read the csv file into a dataframe
import pandas as pd
df = pd.read_csv("huskies_hockey_stats.csv")
df.head(5)
| Season | Date | Opponent | W/L | Score | G | A | PTS | SH | PCT | NO/MIN | PP | SH.1 | EN | GW | GTG | HT | UA | BLK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020 | 12/11/20 | at Boston College | W | 4-1 | 4 | 7 | 11 | 36 | 0.111 | 2-4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | 
| 1 | 2020 | 12/13/20 | Boston College | L | 1-2 | 1 | 2 | 3 | 33 | 0.030 | 5-10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 
| 2 | 2020 | 01/03/21 | Maine | W | 3-0 | 3 | 3 | 6 | 47 | 0.064 | 1-2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 6 | 
| 3 | 2020 | 01/04/21 | Maine | W | 1-0 | 1 | 1 | 2 | 41 | 0.024 | 4-8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 15 | 
| 4 | 2020 | 01/08/21 | New Hampshire | W | 3-1 | 3 | 3 | 6 | 42 | 0.071 | 3-6 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 16 | 
df.dtypes
Season int64 Date object Opponent object W/L object Score object G int64 A int64 PTS int64 SH int64 PCT float64 NO/MIN object PP int64 SH.1 int64 EN int64 GW int64 GTG int64 HT int64 UA int64 BLK int64 dtype: object
type(df.dtypes)
pandas.core.series.Series
# Summarize the entire dataframe
df.describe()
| Season | G | A | PTS | SH | PCT | PP | SH.1 | EN | GW | GTG | HT | UA | BLK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 251.000000 | 
| mean | 2016.872510 | 3.430279 | 5.494024 | 8.924303 | 33.924303 | 0.102920 | 0.693227 | 0.159363 | 0.139442 | 0.653386 | 0.035857 | 0.067729 | 0.286853 | 13.581673 | 
| std | 1.922415 | 2.013484 | 3.514963 | 5.449243 | 8.411792 | 0.059156 | 0.832775 | 0.408047 | 0.380103 | 0.476842 | 0.186304 | 0.251782 | 0.519025 | 5.519810 | 
| min | 2014.000000 | 0.000000 | 0.000000 | 0.000000 | 16.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 | 
| 25% | 2015.000000 | 2.000000 | 3.000000 | 5.000000 | 28.000000 | 0.063000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 
| 50% | 2017.000000 | 3.000000 | 5.000000 | 8.000000 | 33.000000 | 0.095000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 13.000000 | 
| 75% | 2018.500000 | 5.000000 | 7.000000 | 12.000000 | 38.000000 | 0.138500 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 17.000000 | 
| max | 2020.000000 | 12.000000 | 21.000000 | 33.000000 | 64.000000 | 0.345000 | 4.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 | 31.000000 | 
# Summarize just one column
df["G"].describe()
count 251.000000 mean 3.430279 std 2.013484 min 0.000000 25% 2.000000 50% 3.000000 75% 5.000000 max 12.000000 Name: G, dtype: float64
# Don't want the HT column anymore? Drop it
df_col_dropped = df.drop("HT", axis = 1)
df_col_dropped.head(5)
| Season | Date | Opponent | W/L | Score | G | A | PTS | SH | PCT | NO/MIN | PP | SH.1 | EN | GW | GTG | UA | BLK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020 | 12/11/20 | at Boston College | W | 4-1 | 4 | 7 | 11 | 36 | 0.111 | 2-4 | 0 | 0 | 0 | 1 | 0 | 0 | 10 | 
| 1 | 2020 | 12/13/20 | Boston College | L | 1-2 | 1 | 2 | 3 | 33 | 0.030 | 5-10 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 
| 2 | 2020 | 01/03/21 | Maine | W | 3-0 | 3 | 3 | 6 | 47 | 0.064 | 1-2 | 0 | 0 | 0 | 1 | 0 | 1 | 6 | 
| 3 | 2020 | 01/04/21 | Maine | W | 1-0 | 1 | 1 | 2 | 41 | 0.024 | 4-8 | 0 | 0 | 0 | 1 | 0 | 0 | 15 | 
| 4 | 2020 | 01/08/21 | New Hampshire | W | 3-1 | 3 | 3 | 6 | 42 | 0.071 | 3-6 | 0 | 1 | 1 | 1 | 0 | 0 | 16 | 
# Try to drop a column with axis = 0? Nope.
df_axis = df.drop("PP", axis = 0)
df_axis.head(5)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /var/folders/81/695611dx35n9wp1vjkd8y8h00000gn/T/ipykernel_90377/2564734204.py in <module> 1 # Try to drop a column with axis = 0? Nope. ----> 2 df_axis = df.drop("PP", axis = 0) 3 df_axis.head(5) /opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs) 309 stacklevel=stacklevel, 310 ) --> 311 return func(*args, **kwargs) 312 313 return wrapper /opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors) 4904 weight 1.0 0.8 4905 """ -> 4906 return super().drop( 4907 labels=labels, 4908 axis=axis, /opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors) 4148 for axis, labels in axes.items(): 4149 if labels is not None: -> 4150 obj = obj._drop_axis(labels, axis, level=level, errors=errors) 4151 4152 if inplace: /opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors) 4183 new_axis = axis.drop(labels, level=level, errors=errors) 4184 else: -> 4185 new_axis = axis.drop(labels, errors=errors) 4186 result = self.reindex(**{axis_name: new_axis}) 4187 /opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors) 6015 if mask.any(): 6016 if errors != "ignore": -> 6017 raise KeyError(f"{labels[mask]} not found in axis") 6018 indexer = indexer[~mask] 6019 return self.delete(indexer) KeyError: "['PP'] not found in axis"
# Try to drop a row with axis = 0? Now we're talking
df_row_dropped = df.drop(0, axis = 0)
df_row_dropped.head(5)
| Season | Date | Opponent | W/L | Score | G | A | PTS | SH | PCT | NO/MIN | PP | SH.1 | EN | GW | GTG | HT | UA | BLK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2020 | 12/13/20 | Boston College | L | 1-2 | 1 | 2 | 3 | 33 | 0.030 | 5-10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 
| 2 | 2020 | 01/03/21 | Maine | W | 3-0 | 3 | 3 | 6 | 47 | 0.064 | 1-2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 6 | 
| 3 | 2020 | 01/04/21 | Maine | W | 1-0 | 1 | 1 | 2 | 41 | 0.024 | 4-8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 15 | 
| 4 | 2020 | 01/08/21 | New Hampshire | W | 3-1 | 3 | 3 | 6 | 42 | 0.071 | 3-6 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 16 | 
| 5 | 2020 | 01/09/21 | at New Hampshire | T | 2-2 | 2 | 3 | 5 | 32 | 0.063 | 3-6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 
# Collect all the rows where we won (returns another dataframe)
wins = df.loc[(df["W/L"] == "W")]
wins.head(5)
# Iterate over all the rows. Let's look at Opponent in each row
for i in range(len(df)):
    print(df.loc[i, "Opponent"])
# Add a new column that's the sum of two other columns
df["new col"] = df["G"] + df["GW"]
df.head()
| Season | Date | Opponent | W/L | Score | G | A | PTS | SH | PCT | NO/MIN | PP | SH.1 | EN | GW | GTG | HT | UA | BLK | new col | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020 | 12/11/20 | at Boston College | W | 4-1 | 4 | 7 | 11 | 36 | 0.111 | 2-4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | 5 | 
| 1 | 2020 | 12/13/20 | Boston College | L | 1-2 | 1 | 2 | 3 | 33 | 0.030 | 5-10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1 | 
| 2 | 2020 | 01/03/21 | Maine | W | 3-0 | 3 | 3 | 6 | 47 | 0.064 | 1-2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 6 | 4 | 
| 3 | 2020 | 01/04/21 | Maine | W | 1-0 | 1 | 1 | 2 | 41 | 0.024 | 4-8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 15 | 2 | 
| 4 | 2020 | 01/08/21 | New Hampshire | W | 3-1 | 3 | 3 | 6 | 42 | 0.071 | 3-6 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 16 | 4 |