saved notebook

rhodyprog4ds · Feb 15, 2023 · 48468f5 · 48468f5
1 parent 0c1ca2f
commit 48468f5
Showing 1 changed file with 171 additions and 0 deletions.
diff --git a/notes/2023-02-14.md b/notes/2023-02-14.md
@@ -0,0 +1,171 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.14.1
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# Tidy Data and Reshaping Datasets
+
+```{code-cell} ipython3
+import pandas as pd
+import seaborn as sns
+
+sns.set_theme(palette='colorblind',font_scale=2)
+```
+
+```{code-cell} ipython3
+url_base = 'https://raw.githubusercontent.com/rhodyprog4ds/rhodyds/main/data/'
+
+datasets = ['study_a.csv','study_b.csv','study_c.csv']
+```
+
+```{code-cell} ipython3
+list_of_df = [pd.read_csv(url_base + dataset,na_values='-') for dataset in datasets]
+```
+
+```{code-cell} ipython3
+list_of_df[0]
+```
+
+```{code-cell} ipython3
+list_of_df[1]
+```
+
+```{code-cell} ipython3
+list_of_df[2]
+```
+
+```{code-cell} ipython3
+list_of_df[2].mean(numeric_only=True)  # average only the numeric columns; text columns cannot be averaged
+```
+
+```{code-cell} ipython3
+sum([16,3,2,11,1])/5
+```
+
+```{code-cell} ipython3
+sum([16,3,2,11,1,0])/6
+```
+
+```{code-cell} ipython3
+list_of_df[2].groupby('treatment').mean(numeric_only=True)  # per-treatment mean of the numeric columns only
+```
+
+```{code-cell} ipython3
+list_of_df[2].groupby('person').mean(numeric_only=True)  # per-person mean of the numeric columns only
+```
+
+```{code-cell} ipython3
+dfa = list_of_df[0]
+dfa
+```
+
+```{code-cell} ipython3
+dfa.melt(id_vars=['name'],var_name='treatment',value_name='result')
+```
+
+```{code-cell} ipython3
+arabica_data_url = 'https://raw.githubusercontent.com/jldbc/coffee-quality-database/master/data/arabica_data_cleaned.csv'
+# load the data
+coffee_df = pd.read_csv(arabica_data_url)
+# get total bags per country
+bags_per_country = coffee_df.groupby('Country.of.Origin')['Number.of.Bags'].sum()
+
+# sort descending, keep only the top 10 and pick out only the country names
+top_bags_country_list = bags_per_country.sort_values(ascending=False)[:10].index
+
+# filter the original data for only the countries in the top list
+top_coffee_df = coffee_df[coffee_df['Country.of.Origin'].isin(top_bags_country_list)]
+```
+
+```{code-cell} ipython3
+bags_per_country
+```
+
+```{code-cell} ipython3
+top_bags_country_list
+```
+
+```{code-cell} ipython3
+top_coffee_df.head(1)
+```
+
+```{code-cell} ipython3
+coffee_df.head(1)
+```
+
+```{code-cell} ipython3
+coffee_df.shape,top_coffee_df.shape
+```
+
+```{code-cell} ipython3
+top_coffee_df.describe()
+```
+
+```{code-cell} ipython3
+top_coffee_df.columns
+```
+
+```{code-cell} ipython3
+ratings_of_interest = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
+       'Balance', ]
+coffe_scores_df = top_coffee_df.melt(id_vars='Country.of.Origin',value_vars=ratings_of_interest,
+                   var_name='rating',value_name='score')
+coffe_scores_df.head(1)
+```
+
+```{code-cell} ipython3
+top_coffee_df.melt(id_vars='Country.of.Origin')['variable'].unique()
+```
+
+```{code-cell} ipython3
+top_coffee_df.melt(id_vars='Country.of.Origin',value_vars=ratings_of_interest,)['variable'].unique()
+```
+
+```{code-cell} ipython3
+%matplotlib inline
+```
+
+```{code-cell} ipython3
+sns.displot(data=coffe_scores_df, x='score',col='Country.of.Origin',
+           hue = 'rating',col_wrap=5,kind='kde')
+```
+
+```{code-cell} ipython3
+sns.displot(data=coffe_scores_df, x='score',hue='Country.of.Origin',
+           col = 'rating',col_wrap=3,kind='kde')
+```
+
+```{code-cell} ipython3
+top_coffee_df.columns
+```
+
+```{code-cell} ipython3
+coffe_scores_df2= top_coffee_df.melt(id_vars=['Country.of.Origin','Color'],value_vars=ratings_of_interest,
+                   var_name='rating',value_name='score')
+coffe_scores_df2.head(1)
+```
+
+```{code-cell} ipython3
+sns.displot(data=coffe_scores_df2, x='score',hue='Country.of.Origin',
+           col = 'rating',row='Color',kind='kde')
+```
+
+```{code-cell} ipython3
+
+```
+
+```{code-cell} ipython3
+
+```
+
+```{code-cell} ipython3
+
+```