%matplotlib inline
!pip install country_converter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import country_converter as coco
import plotly.offline as po
import plotly.graph_objs as pg

Requirement already satisfied: country_converter in /home/josh/anaconda3/lib/python3.9/site-packages (0.7.7)
Requirement already satisfied: pandas>=1.0 in /home/josh/anaconda3/lib/python3.9/site-packages (from country_converter) (1.4.2)
Requirement already satisfied: python-dateutil>=2.8.1 in /home/josh/anaconda3/lib/python3.9/site-packages (from pandas>=1.0->country_converter) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /home/josh/anaconda3/lib/python3.9/site-packages (from pandas>=1.0->country_converter) (2021.3)
Requirement already satisfied: numpy>=1.18.5 in /home/josh/anaconda3/lib/python3.9/site-packages (from pandas>=1.0->country_converter) (1.21.5)
Requirement already satisfied: six>=1.5 in /home/josh/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas>=1.0->country_converter) (1.16.0)


# Read the salaries CSV; the path is relative to the notebook's working directory.
data_science_jobs = pd.read_csv(filepath_or_buffer='./data/ds_salaries.csv')
# Preview the first five rows.
data_science_jobs.head(n=5)


# Remove the 'Unnamed: 0' column that came in with the CSV.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError for the missing column.
data_science_jobs = data_science_jobs.drop(columns='Unnamed: 0', errors='ignore')
data_science_jobs.head()


# Display the dtype pandas assigned to each column of the frame.
data_science_jobs.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object


# Number of rows recorded for each work year, most frequent first.
data_science_jobs.loc[:, 'work_year'].value_counts()

2022    318
2021    217
2020     72
Name: work_year, dtype: int64


# Keep only the rows whose job title is exactly 'Data Scientist'.
data_scientist_frame = data_science_jobs.loc[data_science_jobs['job_title'].eq('Data Scientist')]
data_scientist_frame.head(n=5)


# White-grid look for the figure.
# NOTE(review): Matplotlib 3.6 renamed this style sheet to 'seaborn-v0_8-whitegrid';
# confirm the Matplotlib version this notebook is meant to run on.
plt.style.use('seaborn-whitegrid')

# Create the figure and its single axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))

# One horizontal bar per row: position is the experience level, length is the USD salary.
# NOTE(review): rows sharing an experience level are drawn on top of each other, so the
# visible bar length is the largest salary at that level, not an average.
horizontal_bar = ax.barh(y=data_scientist_frame['experience_level'],
                         width=data_scientist_frame['salary_in_usd'])

# Title and axis labels.
ax.set_title('Experience Level & Salary for a Data Scientist')
ax.set_xlabel('Salary (USD)')
ax.set_ylabel('Experience Level');


# Data Scientist rows recorded for work year 2021.
ds_salary_2021 = data_scientist_frame.loc[data_scientist_frame['work_year'].eq(2021)]
ds_salary_2021.head(n=5)


# White-grid look for the figure.
# NOTE(review): Matplotlib 3.6 renamed this style sheet to 'seaborn-v0_8-whitegrid';
# confirm the Matplotlib version this notebook is meant to run on.
plt.style.use('seaborn-whitegrid')

# Create the figure and its single axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))

# One green horizontal bar per 2021 row: experience level vs. salary in USD.
# NOTE(review): rows sharing an experience level overlap, so the visible bar length is
# the largest salary at that level.
horizontal_bar = ax.barh(y=ds_salary_2021['experience_level'],
                         width=ds_salary_2021['salary_in_usd'],
                         color='green')

# Title and axis labels.
ax.set_title('Experience Level & Salary for a Data Scientist in 2021')
ax.set_xlabel('Salary (USD)')
ax.set_ylabel('Experience Level');


# Data Scientist rows recorded for work year 2022.
ds_salary_2022 = data_scientist_frame.loc[data_scientist_frame['work_year'].eq(2022)]
ds_salary_2022.head(n=5)


# White-grid look for the figure.
# NOTE(review): Matplotlib 3.6 renamed this style sheet to 'seaborn-v0_8-whitegrid';
# confirm the Matplotlib version this notebook is meant to run on.
plt.style.use('seaborn-whitegrid')

# Create the figure and its single axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))

# One yellow horizontal bar per 2022 row: experience level vs. salary in USD.
# NOTE(review): rows sharing an experience level overlap, so the visible bar length is
# the largest salary at that level.
horizontal_bar = ax.barh(y=ds_salary_2022['experience_level'],
                         width=ds_salary_2022['salary_in_usd'],
                         color='yellow')

# Title and axis labels.
ax.set_title('Experience Level & Salary for a Data Scientist in 2022')
ax.set_xlabel('Salary (USD)')
ax.set_ylabel('Experience Level');


# Side-by-side comparison of Data Scientist salaries: 2022 on the left, 2021 on the right.
# The figure is 15 inches wide (the same size the Machine Learning Engineer comparison
# uses): at 10 inches the two long panel titles run into each other.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Plot the data in ax1: one bar per 2022 row (bars at the same experience level overlap,
# so the visible length is the largest salary at that level).
horizontal_bar = ax1.barh(ds_salary_2022['experience_level'],
                         ds_salary_2022['salary_in_usd'],
                         color='blue')


# Customize the plot in ax1
ax1.set(title='Experience Level & Salary for a Data Scientist in 2022',
      xlabel='Salary (USD)',
      ylabel='Experience Level',
      );

# Plot the data in ax2: one bar per 2021 row.
horizontal_bar = ax2.barh(ds_salary_2021['experience_level'],
                         ds_salary_2021['salary_in_usd'],
                         color='green')


# Customize the plot in ax2
ax2.set(title='Experience Level & Salary for a Data Scientist in 2021',
      xlabel='Salary (USD)',
      ylabel='Experience Level',
      );


# Keep only the rows whose job title is exactly 'Machine Learning Engineer'.
ml_frame = data_science_jobs.loc[data_science_jobs['job_title'].eq('Machine Learning Engineer')]
ml_frame.head(n=5)


# White-grid look for the figure.
# NOTE(review): Matplotlib 3.6 renamed this style sheet to 'seaborn-v0_8-whitegrid';
# confirm the Matplotlib version this notebook is meant to run on.
plt.style.use('seaborn-whitegrid')

# Create the figure and its single axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))

# One horizontal bar per row: position is the experience level, length is the USD salary.
# NOTE(review): rows sharing an experience level overlap, so the visible bar length is
# the largest salary at that level.
horizontal_bar = ax.barh(y=ml_frame['experience_level'],
                         width=ml_frame['salary_in_usd'])

# Title and axis labels.
ax.set_title('Experience Level & Salary for a Machine Learning Engineer')
ax.set_xlabel('Salary (USD)')
ax.set_ylabel('Experience Level');


# Machine Learning Engineer rows recorded for work year 2021.
ml_salary_2021 = ml_frame.loc[ml_frame['work_year'].eq(2021)]
ml_salary_2021.head(n=5)


# White-grid look for the figure.
# NOTE(review): Matplotlib 3.6 renamed this style sheet to 'seaborn-v0_8-whitegrid';
# confirm the Matplotlib version this notebook is meant to run on.
plt.style.use('seaborn-whitegrid')

# Create the figure and its single axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))

# One green horizontal bar per 2021 row: experience level vs. salary in USD.
# NOTE(review): rows sharing an experience level overlap, so the visible bar length is
# the largest salary at that level.
horizontal_bar = ax.barh(y=ml_salary_2021['experience_level'],
                         width=ml_salary_2021['salary_in_usd'],
                         color='green')

# Title and axis labels.
ax.set_title('Experience Level & Salary for a Machine Learning Engineer in 2021')
ax.set_xlabel('Salary (USD)')
ax.set_ylabel('Experience Level');


# Machine Learning Engineer rows recorded for work year 2022.
ml_salary_2022 = ml_frame.loc[ml_frame['work_year'].eq(2022)]
ml_salary_2022.head(n=5)


# White-grid look for the figure.
# NOTE(review): Matplotlib 3.6 renamed this style sheet to 'seaborn-v0_8-whitegrid';
# confirm the Matplotlib version this notebook is meant to run on.
plt.style.use('seaborn-whitegrid')

# Create the figure and its single axes explicitly (object-oriented API).
fig, ax = plt.subplots(figsize=(10, 6))

# One horizontal bar per 2022 row: experience level vs. salary in USD.
# NOTE(review): this is the same colour as the 2021 chart, while the Data Scientist 2022
# chart uses 'yellow' -- confirm whether 'green' is intended here.
# NOTE(review): rows sharing an experience level overlap, so the visible bar length is
# the largest salary at that level.
horizontal_bar = ax.barh(y=ml_salary_2022['experience_level'],
                         width=ml_salary_2022['salary_in_usd'],
                         color='green')

# Title and axis labels.
ax.set_title('Experience Level & Salary for a Machine Learning Engineer in 2022')
ax.set_xlabel('Salary (USD)')
ax.set_ylabel('Experience Level');


# Side-by-side comparison of Machine Learning Engineer salaries:
# 2022 on the left axes, 2021 on the right axes.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: one blue bar per 2022 row (bars at the same experience level overlap).
horizontal_bar = ax1.barh(y=ml_salary_2022['experience_level'],
                          width=ml_salary_2022['salary_in_usd'],
                          color='blue')
ax1.set_title('Experience Level & Salary for a Machine Learning Engineer in 2022')
ax1.set_xlabel('Salary (USD)')
ax1.set_ylabel('Experience Level')

# Right panel: one green bar per 2021 row.
horizontal_bar = ax2.barh(y=ml_salary_2021['experience_level'],
                          width=ml_salary_2021['salary_in_usd'],
                          color='green')
ax2.set_title('Experience Level & Salary for a Machine Learning Engineer in 2021')
ax2.set_xlabel('Salary (USD)')
ax2.set_ylabel('Experience Level');


# Overlaid USD-salary histograms with KDE curves, one colour per work year.
plt.figure(figsize=(10, 6))
sns.set_style('darkgrid')
sns.histplot(data=data_science_jobs,
             x='salary_in_usd',
             hue='work_year',
             palette='Set1',
             kde=True);


# Rows whose experience_level is 'EN'.
data_fresher = data_science_jobs.loc[data_science_jobs['experience_level'].eq('EN')]


# Median of the numeric columns per job title among the 'EN' rows, ranked by USD salary
# in both directions (highest first / lowest first).
# numeric_only=True is passed explicitly: on pandas >= 2.0 a bare median() raises
# TypeError on the text columns instead of dropping them. The result on older pandas is
# unchanged.
top_pay_roles = (data_fresher.groupby('job_title')
                 .median(numeric_only=True)
                 .sort_values(by='salary_in_usd', ascending=False))
least_pay_roles = (data_fresher.groupby('job_title')
                   .median(numeric_only=True)
                   .sort_values(by='salary_in_usd', ascending=True))


# Five highest-ranked roles: median USD salaries and the matching job titles.
values_roles_top = list(top_pay_roles['salary_in_usd'].iloc[:5].values)
names_top = list(top_pay_roles.index[:5])

# Five lowest-ranked roles: median USD salaries and the matching job titles.
values_roles_least = list(least_pay_roles['salary_in_usd'].iloc[:5].values)
names_least = list(least_pay_roles.index[:5])


# Two-panel bar chart: the five highest median salaries on the left, the five lowest on
# the right, with job titles on the x axis.
plt.figure(figsize=(15,5))

# Left panel: top-paying roles.
plt.subplot(1,2,1)
sns.set_style('darkgrid')
sns.barplot(x = names_top,y = values_roles_top,palette='magma')
# rotation is documented as a number or 'vertical'/'horizontal'; the string '90' raises
# ValueError on newer Matplotlib releases, so pass the number.
plt.xticks(rotation=90)
plt.title('Top pay roles')

# Right panel: least-paying roles.
plt.subplot(1,2,2)
sns.set_style('whitegrid')
sns.barplot(x = names_least,y = values_roles_least,palette='crest')
plt.xticks(rotation=90)
plt.title('Least pay roles')
plt.show()


# Preview the first five rows of the full frame again before the map plots.
data_science_jobs.head(n=5)


# Per-column non-null counts for each company location, ordered by the 'work_year'
# count (largest first). Computed once and reused for both the locations and the z values.
location_counts = (data_science_jobs.groupby('company_location')
                   .count()
                   .sort_values(by='work_year', ascending=False))

# Convert the location codes to ISO3, which is what the choropleth is given as locations.
converted_country = coco.convert(names=location_counts.index, to='ISO3')

# Choropleth trace: colour each country by its row count.
data = {
    'type': 'choropleth',
    'colorscale': 'magenta',
    'locations': converted_country,
    'z': location_counts['work_year'],
}
layout = {
    'title': 'Companies By Country',
    'geo': {'projection': {'type': 'robinson'}, 'showlakes': False},
}

x = pg.Figure(data=[data], layout=layout)
po.iplot(x)


# Mean of the numeric columns for each company location, ordered by mean USD salary
# (largest first). Computed once and reused for both the locations and the z values.
# numeric_only=True is passed explicitly: on pandas >= 2.0 a bare mean() raises TypeError
# on the text columns instead of dropping them. The result on older pandas is unchanged.
mean_by_location = (data_science_jobs.groupby('company_location')
                    .mean(numeric_only=True)
                    .sort_values(by='salary_in_usd', ascending=False))

# Convert the location codes to ISO3, which is what the choropleth is given as locations.
converted_country = coco.convert(names=mean_by_location.index, to='ISO3')

# Choropleth trace: colour each country by its mean salary in USD.
data= dict(type='choropleth',
          colorscale='rdbu',
          locations= converted_country,
          z = mean_by_location['salary_in_usd'])
layout = dict(title='Salaries By Country',
             geo= dict(projection={'type':'robinson'},
                      showlakes=False))
x= pg.Figure(data = [data],
            layout = layout)
po.iplot(x)

	Unnamed: 0	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
0	0	2020	MI	FT	Data Scientist	70000	EUR	79833	DE	0	DE	L
1	1	2020	SE	FT	Machine Learning Scientist	260000	USD	260000	JP	0	JP	S
2	2	2020	SE	FT	Big Data Engineer	85000	GBP	109024	GB	50	GB	M
3	3	2020	MI	FT	Product Data Analyst	20000	USD	20000	HN	0	HN	S
4	4	2020	SE	FT	Machine Learning Engineer	150000	USD	150000	US	50	US	L

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
0	2020	MI	FT	Data Scientist	70000	EUR	79833	DE	0	DE	L
1	2020	SE	FT	Machine Learning Scientist	260000	USD	260000	JP	0	JP	S
2	2020	SE	FT	Big Data Engineer	85000	GBP	109024	GB	50	GB	M
3	2020	MI	FT	Product Data Analyst	20000	USD	20000	HN	0	HN	S
4	2020	SE	FT	Machine Learning Engineer	150000	USD	150000	US	50	US	L

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
0	2020	MI	FT	Data Scientist	70000	EUR	79833	DE	0	DE	L
7	2020	MI	FT	Data Scientist	11000000	HUF	35735	HU	50	HU	L
10	2020	EN	FT	Data Scientist	45000	EUR	51321	FR	0	FR	S
11	2020	MI	FT	Data Scientist	3000000	INR	40481	IN	0	IN	L
12	2020	EN	FT	Data Scientist	35000	EUR	39916	FR	0	FR	M

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
75	2021	SE	FT	Data Scientist	45000	EUR	53192	FR	50	FR	L
94	2021	EN	FT	Data Scientist	2200000	INR	29751	IN	50	IN	L
104	2021	MI	FT	Data Scientist	73000	USD	73000	US	0	US	L
116	2021	MI	FT	Data Scientist	50000	USD	50000	NG	100	NG	L
127	2021	MI	FT	Data Scientist	700000	INR	9466	IN	0	IN	S

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
83	2021	MI	FT	Machine Learning Engineer	40000	EUR	47282	ES	100	ES	S
110	2021	SE	FT	Machine Learning Engineer	80000	EUR	94564	DE	50	DE	L
128	2021	EN	FT	Machine Learning Engineer	20000	USD	20000	IN	100	IN	S
145	2021	SE	FT	Machine Learning Engineer	70000	EUR	82744	BE	50	BE	M
159	2021	EN	FT	Machine Learning Engineer	125000	USD	125000	US	100	US	S

Data Science Jobs¶

Reading data¶

Find datatypes in dataframe¶

Exploring the Work Year Column¶

Data Scientist Jobs & Salary¶

Data Scientist Salary in 2021¶

Data Scientist Salary in 2022¶

Data Scientist Salary in 2021 & 2022¶

Machine Learning Engineer & Salary¶

Machine Learning Engineer Salary in 2021¶

Machine Learning Engineer Salary in 2022¶

Machine Learning Engineer Salary in 2021 & 2022¶

Salary distribution variation across years¶

Highest- and lowest-paying roles at experience level EN¶

Number of companies for each country¶

Salary for each country¶

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
292	2022	MI	FT	Data Scientist	130000	USD	130000	US	0	US	M
293	2022	MI	FT	Data Scientist	90000	USD	90000	US	0	US	M
298	2022	SE	FT	Data Scientist	136620	USD	136620	US	100	US	M
299	2022	SE	FT	Data Scientist	99360	USD	99360	US	100	US	M
300	2022	SE	FT	Data Scientist	90000	GBP	117789	GB	0	GB	M

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
371	2022	SE	FT	Machine Learning Engineer	189650	USD	189650	US	0	US	M
372	2022	SE	FT	Machine Learning Engineer	164996	USD	164996	US	0	US	M
386	2022	EN	FT	Machine Learning Engineer	28500	GBP	37300	GB	100	GB	L
389	2022	MI	FT	Machine Learning Engineer	95000	GBP	124333	GB	0	GB	M
390	2022	MI	FT	Machine Learning Engineer	75000	GBP	98158	GB	0	GB	M