%matplotlib inline
!pip install country_converter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import country_converter as coco
import plotly.offline as po
import plotly.graph_objs as pg
Requirement already satisfied: country_converter in /home/josh/anaconda3/lib/python3.9/site-packages (0.7.7) Requirement already satisfied: pandas>=1.0 in /home/josh/anaconda3/lib/python3.9/site-packages (from country_converter) (1.4.2) Requirement already satisfied: python-dateutil>=2.8.1 in /home/josh/anaconda3/lib/python3.9/site-packages (from pandas>=1.0->country_converter) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /home/josh/anaconda3/lib/python3.9/site-packages (from pandas>=1.0->country_converter) (2021.3) Requirement already satisfied: numpy>=1.18.5 in /home/josh/anaconda3/lib/python3.9/site-packages (from pandas>=1.0->country_converter) (1.21.5) Requirement already satisfied: six>=1.5 in /home/josh/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas>=1.0->country_converter) (1.16.0)
# Load the salary data set from the local data directory.
data_science_jobs = pd.read_csv(filepath_or_buffer='./data/ds_salaries.csv')
# Preview the first rows to check how the file was parsed.
data_science_jobs.head()
Unnamed: 0 | work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2020 | MI | FT | Data Scientist | 70000 | EUR | 79833 | DE | 0 | DE | L |
1 | 1 | 2020 | SE | FT | Machine Learning Scientist | 260000 | USD | 260000 | JP | 0 | JP | S |
2 | 2 | 2020 | SE | FT | Big Data Engineer | 85000 | GBP | 109024 | GB | 50 | GB | M |
3 | 3 | 2020 | MI | FT | Product Data Analyst | 20000 | USD | 20000 | HN | 0 | HN | S |
4 | 4 | 2020 | SE | FT | Machine Learning Engineer | 150000 | USD | 150000 | US | 50 | US | L |
# Remove the 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell re-runnable: because the drop is in place,
# a second run would otherwise raise KeyError (the column is already gone).
data_science_jobs.drop(columns='Unnamed: 0', inplace=True, errors='ignore')
data_science_jobs.head()
work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020 | MI | FT | Data Scientist | 70000 | EUR | 79833 | DE | 0 | DE | L |
1 | 2020 | SE | FT | Machine Learning Scientist | 260000 | USD | 260000 | JP | 0 | JP | S |
2 | 2020 | SE | FT | Big Data Engineer | 85000 | GBP | 109024 | GB | 50 | GB | M |
3 | 2020 | MI | FT | Product Data Analyst | 20000 | USD | 20000 | HN | 0 | HN | S |
4 | 2020 | SE | FT | Machine Learning Engineer | 150000 | USD | 150000 | US | 50 | US | L |
# Show the dtype pandas assigned to each column.
data_science_jobs.dtypes
work_year int64 experience_level object employment_type object job_title object salary int64 salary_currency object salary_in_usd int64 employee_residence object remote_ratio int64 company_location object company_size object dtype: object
# Count how often each work year occurs in the data set.
data_science_jobs.work_year.value_counts()
2022 318 2021 217 2020 72 Name: work_year, dtype: int64
# Keep only the rows whose job title is exactly 'Data Scientist'.
is_data_scientist = data_science_jobs['job_title'].eq('Data Scientist')
data_scientist_frame = data_science_jobs.loc[is_data_scientist]
data_scientist_frame.head()
work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020 | MI | FT | Data Scientist | 70000 | EUR | 79833 | DE | 0 | DE | L |
7 | 2020 | MI | FT | Data Scientist | 11000000 | HUF | 35735 | HU | 50 | HU | L |
10 | 2020 | EN | FT | Data Scientist | 45000 | EUR | 51321 | FR | 0 | FR | S |
11 | 2020 | MI | FT | Data Scientist | 3000000 | INR | 40481 | IN | 0 | IN | L |
12 | 2020 | EN | FT | Data Scientist | 35000 | EUR | 39916 | FR | 0 | FR | M |
# Set the style. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later releases removed the old names, so use the new
# name when this Matplotlib offers it and the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
# Create the figure and axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the data: one horizontal bar per row of the frame.
# NOTE(review): rows sharing an experience level are drawn at the same y
# position, so their bars overlap and the visible length is effectively the
# largest salary of that level - confirm this is the intended reading.
horizontal_bar = ax.barh(
    data_scientist_frame['experience_level'],
    data_scientist_frame['salary_in_usd'],
)
# Customize the plot; the trailing semicolon hides the cell's text output.
ax.set(
    title='Experience Level & Salary for a Data Scientist',
    xlabel='Salary (USD)',
    ylabel='Experience Level',
);
# Restrict the Data Scientist rows to work year 2021.
in_2021 = data_scientist_frame['work_year'].eq(2021)
ds_salary_2021 = data_scientist_frame.loc[in_2021]
ds_salary_2021.head()
work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|
75 | 2021 | SE | FT | Data Scientist | 45000 | EUR | 53192 | FR | 50 | FR | L |
94 | 2021 | EN | FT | Data Scientist | 2200000 | INR | 29751 | IN | 50 | IN | L |
104 | 2021 | MI | FT | Data Scientist | 73000 | USD | 73000 | US | 0 | US | L |
116 | 2021 | MI | FT | Data Scientist | 50000 | USD | 50000 | NG | 100 | NG | L |
127 | 2021 | MI | FT | Data Scientist | 700000 | INR | 9466 | IN | 0 | IN | S |
# Set the style. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later releases removed the old names, so use the new
# name when this Matplotlib offers it and the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
# Create the figure and axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the data: one horizontal bar per 2021 row.
# NOTE(review): rows sharing an experience level are drawn at the same y
# position, so their bars overlap and the visible length is effectively the
# largest salary of that level - confirm this is the intended reading.
horizontal_bar = ax.barh(
    ds_salary_2021['experience_level'],
    ds_salary_2021['salary_in_usd'],
    color='green',
)
# Customize the plot; the trailing semicolon hides the cell's text output.
ax.set(
    title='Experience Level & Salary for a Data Scientist in 2021',
    xlabel='Salary (USD)',
    ylabel='Experience Level',
);
# Restrict the Data Scientist rows to work year 2022.
in_2022 = data_scientist_frame['work_year'].eq(2022)
ds_salary_2022 = data_scientist_frame.loc[in_2022]
ds_salary_2022.head()
work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|
292 | 2022 | MI | FT | Data Scientist | 130000 | USD | 130000 | US | 0 | US | M |
293 | 2022 | MI | FT | Data Scientist | 90000 | USD | 90000 | US | 0 | US | M |
298 | 2022 | SE | FT | Data Scientist | 136620 | USD | 136620 | US | 100 | US | M |
299 | 2022 | SE | FT | Data Scientist | 99360 | USD | 99360 | US | 100 | US | M |
300 | 2022 | SE | FT | Data Scientist | 90000 | GBP | 117789 | GB | 0 | GB | M |
# Set the style. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later releases removed the old names, so use the new
# name when this Matplotlib offers it and the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
# Create the figure and axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the data: one horizontal bar per 2022 row.
# NOTE(review): rows sharing an experience level are drawn at the same y
# position, so their bars overlap and the visible length is effectively the
# largest salary of that level - confirm this is the intended reading.
horizontal_bar = ax.barh(
    ds_salary_2022['experience_level'],
    ds_salary_2022['salary_in_usd'],
    color='yellow',
)
# Customize the plot; the trailing semicolon hides the cell's text output.
ax.set(
    title='Experience Level & Salary for a Data Scientist in 2022',
    xlabel='Salary (USD)',
    ylabel='Experience Level',
);
# Two panels side by side: 2022 on the left, 2021 on the right.
# The figure is 15 inches wide, matching the equivalent Machine Learning
# Engineer comparison, so the two long panel titles have room next to
# each other.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
# Plot the data in ax1
horizontal_bar = ax1.barh(
    ds_salary_2022['experience_level'],
    ds_salary_2022['salary_in_usd'],
    color='blue',
)
# Customize the plot in ax1
ax1.set(
    title='Experience Level & Salary for a Data Scientist in 2022',
    xlabel='Salary (USD)',
    ylabel='Experience Level',
);
# Plot the data in ax2 (horizontal_bar now refers to the 2021 bars)
horizontal_bar = ax2.barh(
    ds_salary_2021['experience_level'],
    ds_salary_2021['salary_in_usd'],
    color='green',
)
# Customize the plot in ax2; the trailing semicolon hides the text output.
ax2.set(
    title='Experience Level & Salary for a Data Scientist in 2021',
    xlabel='Salary (USD)',
    ylabel='Experience Level',
);
# Keep only the rows whose job title is exactly 'Machine Learning Engineer'.
is_ml_engineer = data_science_jobs['job_title'].eq('Machine Learning Engineer')
ml_frame = data_science_jobs.loc[is_ml_engineer]
ml_frame.head()
work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|
4 | 2020 | SE | FT | Machine Learning Engineer | 150000 | USD | 150000 | US | 50 | US | L |
20 | 2020 | MI | FT | Machine Learning Engineer | 299000 | CNY | 43331 | CN | 0 | CN | M |
37 | 2020 | EN | FT | Machine Learning Engineer | 250000 | USD | 250000 | US | 50 | US | L |
39 | 2020 | EN | FT | Machine Learning Engineer | 138000 | USD | 138000 | US | 100 | US | S |
64 | 2020 | SE | FT | Machine Learning Engineer | 40000 | EUR | 45618 | HR | 100 | HR | S |
# Set the style. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later releases removed the old names, so use the new
# name when this Matplotlib offers it and the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
# Create the figure and axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the data: one horizontal bar per row of the frame.
# NOTE(review): rows sharing an experience level are drawn at the same y
# position, so their bars overlap and the visible length is effectively the
# largest salary of that level - confirm this is the intended reading.
horizontal_bar = ax.barh(
    ml_frame['experience_level'],
    ml_frame['salary_in_usd'],
)
# Customize the plot; the trailing semicolon hides the cell's text output.
ax.set(
    title='Experience Level & Salary for a Machine Learning Engineer',
    xlabel='Salary (USD)',
    ylabel='Experience Level',
);
# Restrict the Machine Learning Engineer rows to work year 2021.
in_2021 = ml_frame['work_year'].eq(2021)
ml_salary_2021 = ml_frame.loc[in_2021]
ml_salary_2021.head()
work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|
83 | 2021 | MI | FT | Machine Learning Engineer | 40000 | EUR | 47282 | ES | 100 | ES | S |
110 | 2021 | SE | FT | Machine Learning Engineer | 80000 | EUR | 94564 | DE | 50 | DE | L |
128 | 2021 | EN | FT | Machine Learning Engineer | 20000 | USD | 20000 | IN | 100 | IN | S |
145 | 2021 | SE | FT | Machine Learning Engineer | 70000 | EUR | 82744 | BE | 50 | BE | M |
159 | 2021 | EN | FT | Machine Learning Engineer | 125000 | USD | 125000 | US | 100 | US | S |
# Set the style. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later releases removed the old names, so use the new
# name when this Matplotlib offers it and the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
# Create the figure and axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the data: one horizontal bar per 2021 row.
# NOTE(review): rows sharing an experience level are drawn at the same y
# position, so their bars overlap and the visible length is effectively the
# largest salary of that level - confirm this is the intended reading.
horizontal_bar = ax.barh(
    ml_salary_2021['experience_level'],
    ml_salary_2021['salary_in_usd'],
    color='green',
)
# Customize the plot; the trailing semicolon hides the cell's text output.
ax.set(
    title='Experience Level & Salary for a Machine Learning Engineer in 2021',
    xlabel='Salary (USD)',
    ylabel='Experience Level',
);
# Restrict the Machine Learning Engineer rows to work year 2022.
in_2022 = ml_frame['work_year'].eq(2022)
ml_salary_2022 = ml_frame.loc[in_2022]
ml_salary_2022.head()
work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|
371 | 2022 | SE | FT | Machine Learning Engineer | 189650 | USD | 189650 | US | 0 | US | M |
372 | 2022 | SE | FT | Machine Learning Engineer | 164996 | USD | 164996 | US | 0 | US | M |
386 | 2022 | EN | FT | Machine Learning Engineer | 28500 | GBP | 37300 | GB | 100 | GB | L |
389 | 2022 | MI | FT | Machine Learning Engineer | 95000 | GBP | 124333 | GB | 0 | GB | M |
390 | 2022 | MI | FT | Machine Learning Engineer | 75000 | GBP | 98158 | GB | 0 | GB | M |
# Set the style. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later releases removed the old names, so use the new
# name when this Matplotlib offers it and the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
# Create the figure and axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the data: one horizontal bar per 2022 row.
# NOTE(review): rows sharing an experience level are drawn at the same y
# position, so their bars overlap and the visible length is effectively the
# largest salary of that level - confirm this is the intended reading.
horizontal_bar = ax.barh(
    ml_salary_2022['experience_level'],
    ml_salary_2022['salary_in_usd'],
    color='green',
)
# Customize the plot; the trailing semicolon hides the cell's text output.
ax.set(
    title='Experience Level & Salary for a Machine Learning Engineer in 2022',
    xlabel='Salary (USD)',
    ylabel='Experience Level',
);
# One row, two columns: 2022 in the left panel, 2021 in the right panel.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: Machine Learning Engineer salaries for 2022.
horizontal_bar = ax1.barh(
    ml_salary_2022['experience_level'],
    ml_salary_2022['salary_in_usd'],
    color='blue',
)
ax1.set_title('Experience Level & Salary for a Machine Learning Engineer in 2022')
ax1.set_xlabel('Salary (USD)')
ax1.set_ylabel('Experience Level')

# Right panel: Machine Learning Engineer salaries for 2021.
# horizontal_bar is rebound here, so afterwards it refers to the 2021 bars.
horizontal_bar = ax2.barh(
    ml_salary_2021['experience_level'],
    ml_salary_2021['salary_in_usd'],
    color='green',
)
ax2.set_title('Experience Level & Salary for a Machine Learning Engineer in 2021')
ax2.set_xlabel('Salary (USD)')
# The trailing semicolon hides the cell's text output.
ax2.set_ylabel('Experience Level');
# Salary distribution in USD, one colour per work year, with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.set_style('darkgrid')
# The trailing semicolon hides the cell's text output.
sns.histplot(
    data=data_science_jobs,
    x='salary_in_usd',
    hue='work_year',
    palette='Set1',
    kde=True,
);
# Entry-level ('EN') rows only.
data_fresher = data_science_jobs[data_science_jobs['experience_level'] == 'EN']

# Median of the numeric columns per job title, computed once and sorted
# both ways. numeric_only=True is explicit because pandas >= 2.0 no longer
# skips the text columns by default and raises TypeError on them instead.
median_by_role = data_fresher.groupby('job_title').median(numeric_only=True)
top_pay_roles = median_by_role.sort_values(by='salary_in_usd', ascending=False)
least_pay_roles = median_by_role.sort_values(by='salary_in_usd', ascending=True)

# Five highest and five lowest median USD salaries with their job titles.
values_roles_top = list(top_pay_roles.iloc[:5]['salary_in_usd'].values)
names_top = list(top_pay_roles.iloc[:5].index)
values_roles_least = list(least_pay_roles.iloc[:5]['salary_in_usd'].values)
names_least = list(least_pay_roles.iloc[:5].index)

plt.figure(figsize=(15, 5))

# A seaborn style only applies to axes created after it is set, so each
# style is selected before the subplot it is meant for is added.
sns.set_style('darkgrid')
plt.subplot(1, 2, 1)
sns.barplot(x=names_top, y=values_roles_top, palette='magma')
plt.xticks(rotation=90)  # rotation is an angle in degrees, not a string
plt.title('Top pay roles')

sns.set_style('whitegrid')
plt.subplot(1, 2, 2)
sns.barplot(x=names_least, y=values_roles_least, palette='crest')
plt.xticks(rotation=90)
plt.title('Least pay roles')

plt.show()
# Show the first five rows of the full data set again.
data_science_jobs.head(n=5)
work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020 | MI | FT | Data Scientist | 70000 | EUR | 79833 | DE | 0 | DE | L |
1 | 2020 | SE | FT | Machine Learning Scientist | 260000 | USD | 260000 | JP | 0 | JP | S |
2 | 2020 | SE | FT | Big Data Engineer | 85000 | GBP | 109024 | GB | 50 | GB | M |
3 | 2020 | MI | FT | Product Data Analyst | 20000 | USD | 20000 | HN | 0 | HN | S |
4 | 2020 | SE | FT | Machine Learning Engineer | 150000 | USD | 150000 | US | 50 | US | L |
# Non-null 'work_year' entries per company location, largest first; the
# 'work_year' column is only used as the carrier of the per-group count.
location_counts = (
    data_science_jobs.groupby('company_location')
    .count()
    .sort_values(by='work_year', ascending=False)
)
# Convert the location codes to ISO3, the code type requested from coco.
converted_country = coco.convert(names=location_counts.index, to='ISO3')

# Choropleth trace: one colour value (the count) per converted country code.
data = {
    'type': 'choropleth',
    'colorscale': 'magenta',
    'locations': converted_country,
    'z': location_counts['work_year'],
}
layout = {
    'title': 'Companies By Country',
    'geo': {
        'projection': {'type': 'robinson'},
        'showlakes': False,
    },
}
x = pg.Figure(data=[data], layout=layout)
po.iplot(x)
# Mean USD salary per company location, largest first, computed once.
# The salary column is selected before aggregating: averaging the whole
# frame would also hit the text columns, which pandas >= 2.0 rejects with
# a TypeError instead of silently skipping them.
mean_salary_by_location = (
    data_science_jobs.groupby('company_location')['salary_in_usd']
    .mean()
    .sort_values(ascending=False)
)
# Convert the location codes to ISO3, the code type requested from coco.
converted_country = coco.convert(names=mean_salary_by_location.index, to='ISO3')

# Choropleth trace: one colour value (the mean salary) per country code.
data = dict(
    type='choropleth',
    colorscale='rdbu',
    locations=converted_country,
    z=mean_salary_by_location,
)
layout = dict(
    title='Salaries By Country',
    geo=dict(projection={'type': 'robinson'}, showlakes=False),
)
x = pg.Figure(data=[data], layout=layout)
po.iplot(x)