Skip to content

Commit

Permalink
saved notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
brownsarahm committed Feb 15, 2023
1 parent 0c1ca2f commit 48468f5
Showing 1 changed file with 171 additions and 0 deletions.
171 changes: 171 additions & 0 deletions notes/2023-02-14.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
---
jupytext:
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.14.1
kernelspec:
display_name: Python 3
language: python
name: python3
---

# Tidy Data and Reshaping Datasets

```{code-cell} ipython3
import pandas as pd
import seaborn as sns
# Plot defaults for the whole notebook: the 'colorblind' palette and doubled font scale
sns.set_theme(palette='colorblind',font_scale=2)
```

```{code-cell} ipython3
# Folder that holds the example data, and the three study files inside it
# (study_a.csv, study_b.csv, study_c.csv).
url_base = 'https://raw.githubusercontent.com/rhodyprog4ds/rhodyds/main/data/'
datasets = [f'study_{letter}.csv' for letter in 'abc']
```

```{code-cell} ipython3
# Read every study file into its own DataFrame (same order as `datasets`);
# cells containing '-' are parsed as missing values (NaN).
list_of_df = [pd.read_csv(url_base + dataset,na_values='-') for dataset in datasets]
```

```{code-cell} ipython3
# Display the first loaded table (study_a.csv)
list_of_df[0]
```

```{code-cell} ipython3
# Display the second loaded table (study_b.csv)
list_of_df[1]
```

```{code-cell} ipython3
# Display the third loaded table (study_c.csv)
list_of_df[2]
```

```{code-cell} ipython3
# Column means of the third table. Missing values (NaN) are skipped by default.
# numeric_only=True restricts the mean to numeric columns: the identifier
# columns (person / treatment) are presumably text, and pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
list_of_df[2].mean(numeric_only=True)
```

```{code-cell} ipython3
# Hand-computed mean of five values (presumably the non-missing results;
# confirm against the table) for comparison with the pandas mean.
sum([16,3,2,11,1])/5
```

```{code-cell} ipython3
# Same five values plus a 0, averaged over six entries -- presumably what the
# mean would be if a missing entry were counted as 0 instead of being skipped.
sum([16,3,2,11,1,0])/6
```

```{code-cell} ipython3
# Mean of the numeric columns within each treatment group.
# numeric_only=True keeps any remaining text column (presumably 'person') out
# of the aggregation; pandas >= 2.0 raises a TypeError on it otherwise.
list_of_df[2].groupby('treatment').mean(numeric_only=True)
```

```{code-cell} ipython3
# Mean of the numeric columns within each person group.
# numeric_only=True keeps any remaining text column (presumably 'treatment')
# out of the aggregation; pandas >= 2.0 raises a TypeError on it otherwise.
list_of_df[2].groupby('person').mean(numeric_only=True)
```

```{code-cell} ipython3
# Give the first table (study_a.csv) a short name, then display it
dfa = list_of_df[0]
dfa
```

```{code-cell} ipython3
# Reshape from wide to long: 'name' stays as the identifier column, and every
# other column is stacked into a ('treatment', 'result') pair of columns.
dfa.melt(
    id_vars=['name'],
    var_name='treatment',
    value_name='result',
)
```

```{code-cell} ipython3
# Location of the cleaned arabica CSV; read it into a DataFrame
arabica_data_url = 'https://raw.githubusercontent.com/jldbc/coffee-quality-database/master/data/arabica_data_cleaned.csv'
coffee_df = pd.read_csv(arabica_data_url)

# Total number of bags for each country of origin
bags_per_country = coffee_df.groupby('Country.of.Origin')['Number.of.Bags'].sum()

# Rank countries from most to fewest bags and keep the names of the first 10
top_bags_country_list = (
    bags_per_country
    .sort_values(ascending=False)
    .head(10)
    .index
)

# Keep only the rows of the original data whose country is in that top list
top_coffee_df = coffee_df.loc[coffee_df['Country.of.Origin'].isin(top_bags_country_list)]
```

```{code-cell} ipython3
# Display the summed number of bags for every country of origin
bags_per_country
```

```{code-cell} ipython3
# Display the index holding the 10 countries with the most bags
top_bags_country_list
```

```{code-cell} ipython3
# First row of the filtered (top-10 countries) data
top_coffee_df.head(1)
```

```{code-cell} ipython3
# First row of the full, unfiltered data
coffee_df.head(1)
```

```{code-cell} ipython3
# (rows, columns) of the full data next to the filtered data
coffee_df.shape,top_coffee_df.shape
```

```{code-cell} ipython3
# Summary statistics of the filtered data (describe() defaults to the numeric columns)
top_coffee_df.describe()
```

```{code-cell} ipython3
# List the column names available in the filtered data
top_coffee_df.columns
```

```{code-cell} ipython3
# Rating columns to compare
ratings_of_interest = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance']

# Long format: each original row contributes one row per selected rating,
# labelled with its country, the rating name and the score value.
coffe_scores_df = top_coffee_df.melt(
    id_vars='Country.of.Origin',
    value_vars=ratings_of_interest,
    var_name='rating',
    value_name='score',
)
coffe_scores_df.head(1)
```

```{code-cell} ipython3
# Without value_vars, melt stacks every column other than the id column;
# list the distinct column names that end up in 'variable'.
top_coffee_df.melt(id_vars='Country.of.Origin')['variable'].unique()
```

```{code-cell} ipython3
# With value_vars given, only the selected rating columns appear in 'variable'
top_coffee_df.melt(id_vars='Country.of.Origin',value_vars=ratings_of_interest,)['variable'].unique()
```

```{code-cell} ipython3
%matplotlib inline
```

```{code-cell} ipython3
# Density of scores: one panel per country (wrapping after 5 panels),
# with one curve per rating inside each panel.
sns.displot(
    data=coffe_scores_df,
    x='score',
    col='Country.of.Origin',
    col_wrap=5,
    hue='rating',
    kind='kde',
)
```

```{code-cell} ipython3
# Density of scores: one panel per rating (wrapping after 3 panels),
# with one curve per country inside each panel.
sns.displot(
    data=coffe_scores_df,
    x='score',
    col='rating',
    col_wrap=3,
    hue='Country.of.Origin',
    kind='kde',
)
```

```{code-cell} ipython3
# List the column names again to pick a further identifier column
top_coffee_df.columns
```

```{code-cell} ipython3
# Same reshaping as for the rating scores, but 'Color' is kept as a second
# identifier column alongside the country.
coffe_scores_df2 = top_coffee_df.melt(
    id_vars=['Country.of.Origin', 'Color'],
    value_vars=ratings_of_interest,
    var_name='rating',
    value_name='score',
)
coffe_scores_df2.head(1)
```

```{code-cell} ipython3
# Grid of score densities: columns are ratings, rows are values of 'Color',
# and each panel draws one curve per country.
sns.displot(
    data=coffe_scores_df2,
    x='score',
    col='rating',
    row='Color',
    hue='Country.of.Origin',
    kind='kde',
)
```

```{code-cell} ipython3
```

```{code-cell} ipython3
```

```{code-cell} ipython3
```

0 comments on commit 48468f5

Please sign in to comment.