import pandas as pd

# Read the diabetes CSV (expected in the current working directory) into a
# DataFrame that the rest of the notebook refers to as `df`.
df = pd.read_csv(
    filepath_or_buffer='diabetes_balanced.csv',
)
# Preview the first rows (displayed when run as the last line of a cell).
df.head()


# List the column labels available in the loaded frame.
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')


# Data dictionary: maps each column name to a short human-readable
# description of what that column records, including the value coding for
# the ordinal columns (GenHlth, Age, Education, Income) and for Sex.
data_dict = {
    'Diabetes_binary': 'diagnosis for prediabetes or diabetes',
    'HighBP': 'presence of high blood pressure',
    'HighChol': 'presence of high cholesterol',
    # Spelling fixed: was "cholestoral".
    'CholCheck': 'cholesterol check in the past 5 years',
    'BMI': 'body mass index',
    'Smoker': 'smoked at least 100 cigarettes (5 packs) in lifetime',
    'Stroke': 'has experienced a stroke in lifetime',
    'HeartDiseaseorAttack': 'has experienced heart disease or heart attack in lifetime',
    'PhysActivity': 'completed physical activity in past 30 days (outside of profession)',
    'Fruits': 'consumes 1+ fruits per day',
    'Veggies': 'consumes 1+ vegetables per day',
    'HvyAlcoholConsump': 'heavy alcohol consumption (adult men >=14 drinks per week, adult women>=7 drinks per week)',
    'AnyHealthcare': 'has any form of health insurance',
    'NoDocbcCost': 'has not seen a doctor when needed in the past year because of cost',
    'GenHlth': 'self-analysis of health, 1 (excellent) - 5 (poor)',
    'MentHlth': 'days of poor mental health in past 30 days',
    'PhysHlth': 'days of physical illness or injury in past 30 days',
    'DiffWalk': 'has difficulty walking or climbing',
    'Sex': 'sex, 0 (female) or 1 (male)',
    'Age': '13 level age range, 1 (18-24 years) - 13 (80+ years)',
    'Education': '6 level education range, 1 (never attended school) - 6 (college graduate)',
    'Income': '8 level income range, 1 (<10,000) - 8 (>75,000)',
}
# Bare expression so a notebook cell displays the dictionary.
data_dict

{'Diabetes_binary': 'diagnosis for prediabetes or diabetes',
 'HighBP': 'presence of high blood pressure',
 'HighChol': 'presence of high cholesterol',
 'CholCheck': 'cholestoral check in the past 5 years',
 'BMI': 'body mass index',
 'Smoker': 'smoked at least 100 cigarettes (5 packs) in lifetime',
 'Stroke': 'has experienced a stroke in lifetime',
 'HeartDiseaseorAttack': 'has experienced heart disease or heart attack in lifetime',
 'PhysActivity': 'completed physical activity in past 30 days (outside of profession)',
 'Fruits': 'consumes 1+ fruits per day',
 'Veggies': 'consumes 1+ vegetables per day',
 'HvyAlcoholConsump': 'heavy alcohol consumption (adult men >=14 drinks per week, adult women>=7 drinks per week)',
 'AnyHealthcare': 'has any form of health insurance',
 'NoDocbcCost': 'has not seen a doctor when needed in the past year because of cost',
 'GenHlth': 'self-analysis of health, 1 (excellent) - 5 (poor)',
 'MentHlth': 'days of poor mental health in past 30 days',
 'PhysHlth': 'days of physical illness or injury in past 30 days',
 'DiffWalk': 'has difficulty walking or climbing',
 'Sex': 'sex, 0 (female) or 1 (male)',
 'Age': '13 level age range, 1 (18-24 years) - 13 (80+ years)',
 'Education': '6 level education range, 1 (never attended school) - 6 (college graduate)',
 'Income': '8 level income range, 1 (<10,000) - 8 (>75,000)'}

	HighBP	HighChol	CholCheck	BMI	Smoker	Stroke	PhysActivity	Fruits	...	AnyHealthcare	GenHlth	MentHlth	PhysHlth	Sex	Age	Education	Income
0	1.0	0.0	1.0	26.0	0.0	0.0	1.0	0.0	...	1.0	3.0	5.0	30.0	1.0	4.0	6.0	8.0
1	1.0	1.0	1.0	26.0	1.0	1.0	0.0	1.0	...	1.0	3.0	0.0	0.0	1.0	12.0	6.0	8.0
2	0.0	0.0	1.0	26.0	0.0	0.0	1.0	1.0	...	1.0	1.0	0.0	10.0	1.0	13.0	6.0	8.0
3	1.0	1.0	1.0	28.0	1.0	0.0	1.0	1.0	...	1.0	3.0	0.0	3.0	1.0	11.0	6.0	8.0
4	0.0	0.0	1.0	29.0	1.0	0.0	1.0	1.0	...	1.0	2.0	0.0	0.0	0.0	8.0	5.0	8.0

Predicting Diabetes with Risk Factors¶

Citations:¶

Tabish SA. Is Diabetes Becoming the Biggest Epidemic of the Twenty-first Century? Int J Health Sci (Qassim). 2007 Jul;1(2):V-VIII. PMID: 21475425; PMCID: PMC3068646.¶

“Diabetes.” World Health Organization, 16 Sept. 2022, https://www.who.int/news-room/fact-sheets/detail/diabetes.¶

Dataset¶

From Kaggle¶

Data dictionary¶

Project plan and purpose¶

Predicting Diabetes with Risk Factors¶

Citations:¶

Tabish SA. Is Diabetes Becoming the Biggest Epidemic of the Twenty-first Century? Int J Health Sci (Qassim). 2007 Jul;1(2):V-VIII. PMID: 21475425; PMCID: PMC3068646.¶

“Diabetes.” World Health Organization, 16 Sept. 2022, https://www.who.int/news-room/fact-sheets/detail/diabetes.¶

Dataset¶

From Kaggle¶

Data dictionary¶

Project plan and purpose¶

From Kaggle¶