In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
plt.style.use('ggplot')
avocado_data = pd.read_csv('Avocado_HassAvocadoBoard_20152023v1.0.1.csv')
avocado_data.head()
Out[2]:
| Date | AveragePrice | TotalVolume | plu4046 | plu4225 | plu4770 | TotalBags | SmallBags | LargeBags | XLargeBags | type | region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-04 | 1.22 | 40873.28 | 2819.50 | 28287.42 | 49.90 | 9716.46 | 9186.93 | 529.53 | 0.0 | conventional | Albany |
| 1 | 2015-01-04 | 1.79 | 1373.95 | 57.42 | 153.88 | 0.00 | 1162.65 | 1162.65 | 0.00 | 0.0 | organic | Albany |
| 2 | 2015-01-04 | 1.00 | 435021.49 | 364302.39 | 23821.16 | 82.15 | 46815.79 | 16707.15 | 30108.64 | 0.0 | conventional | Atlanta |
| 3 | 2015-01-04 | 1.76 | 3846.69 | 1500.15 | 938.35 | 0.00 | 1408.19 | 1071.35 | 336.84 | 0.0 | organic | Atlanta |
| 4 | 2015-01-04 | 1.08 | 788025.06 | 53987.31 | 552906.04 | 39995.03 | 141136.68 | 137146.07 | 3990.61 | 0.0 | conventional | BaltimoreWashington |
In [3]:
# 检查并计数重复的行
duplicate_count = avocado_data.duplicated().sum()
print(f"Number of duplicated rows: {duplicate_count}")
Number of duplicated rows: 0
In [4]:
# 根据“平均价格”列中的IQR识别异常值
Q1 = avocado_data['AveragePrice'].quantile(0.25)
Q3 = avocado_data['AveragePrice'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_price = (avocado_data['AveragePrice'] < lower_bound) | (avocado_data['AveragePrice'] > upper_bound)
# 打印异常值的数量
print(f"Number of outliers in AveragePrice: {outliers_price.sum()}")
Number of outliers in AveragePrice: 358
In [5]:
avocado_data=avocado_data[~outliers_price]
In [8]:
# 识别有缺失值的行
missing_rows = avocado_data[avocado_data[['SmallBags', 'LargeBags', 'XLargeBags']].isnull().any(axis=1)]
# 遍历缺失的行
for index, row in missing_rows.iterrows():
# 生成总和为1的随机百分比
random_percentages = np.random.dirichlet(np.ones(3), size=1)[0]
# 计算使总和等于TotalBags所需的剩余值
remaining_value = row['TotalBags'] - row[['SmallBags', 'LargeBags', 'XLargeBags']].sum()
# 用随机百分比填充缺失值
avocado_data.at[index, 'SmallBags'] = remaining_value * random_percentages[0]
avocado_data.at[index, 'LargeBags'] = remaining_value * random_percentages[1]
avocado_data.at[index, 'XLargeBags'] = remaining_value * random_percentages[2]
# 验证是否没有其他缺失的值
print(avocado_data[['TotalBags', 'SmallBags', 'LargeBags', 'XLargeBags']].isnull().sum())
TotalBags 0 SmallBags 0 LargeBags 0 XLargeBags 0 dtype: int64
In [9]:
avocado_data[['plu4046', 'plu4225', 'plu4770']] = avocado_data[['plu4046', 'plu4225', 'plu4770']].astype(str)
avocado_data[['TotalBags', 'SmallBags', 'LargeBags', 'XLargeBags']] = avocado_data[['TotalBags', 'SmallBags', 'LargeBags', 'XLargeBags']].astype(int)
In [11]:
# 每个PLU代码的唯一值的数量
unique_counts = [avocado_data['plu4046'].nunique(), avocado_data['plu4225'].nunique(), avocado_data['plu4770'].nunique()]
plu_codes = ['PLU 4046', 'PLU 4225', 'PLU 4770']
barplot = sns.barplot(x=plu_codes, y=unique_counts, palette='viridis')
plt.title('Number of Unique Values for PLU Codes')
plt.xlabel('PLU Codes')
plt.ylabel('Number of Unique Values')
#在条形图的顶部添加数据值
for i, count in enumerate(unique_counts):
barplot.text(i, count + 0.1, str(count), ha='center', va='bottom')
plt.show()
In [12]:
# 计算所有行中每种包类型的和
total_small_bags = avocado_data['SmallBags'].sum()
total_large_bags = avocado_data['LargeBags'].sum()
total_xlarge_bags = avocado_data['XLargeBags'].sum()
# 为饼图创建数据
sizes = [total_small_bags, total_large_bags, total_xlarge_bags]
labels = ['SmallBags', 'LargeBags', 'XLargeBags']
colors = ['lightcoral', 'lightskyblue', 'lightgreen']
# 绘制饼状图
plt.figure(figsize=(4, 4))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Bag Types')
plt.show()
In [13]:
# 计算“type”的分布
type_distribution = avocado_data['type'].value_counts()
# 为饼状图创建数据
sizes = type_distribution.values
labels = type_distribution.index
colors = ['lightcoral', 'lightskyblue']
# 绘制饼状图
plt.figure(figsize=(4, 4))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Avocado Types')
plt.show()
In [14]:
# 按“地区”分组,并计算每个地区的平均“平均价格”
avg_price_by_region = avocado_data.groupby('region')['AveragePrice'].mean()
sorted_avg_price_by_region = avg_price_by_region.sort_values(ascending=False)
top_regions = sorted_avg_price_by_region.head(10)
# 使用Plotly创建交互式水平条形图
fig = px.bar(x=top_regions.values, y=top_regions.index, orientation='h', color=top_regions.values,
labels={'x': 'Average Price', 'y': 'Region'}, title='Top Regions by Average Price',
color_continuous_scale='Plasma')
fig.show()
In [15]:
# 按“区域”分组,并计算每个区域的平均“TotalVolume”
avg_volume_by_region = avocado_data.groupby('region')['TotalVolume'].mean()
sorted_avg_volume_by_region = avg_volume_by_region.sort_values(ascending=False)
top_regions_volume = sorted_avg_volume_by_region.head(10)
# 使用Plotly创建交互式水平条形图
fig = px.bar(x=top_regions_volume.values, y=top_regions_volume.index, orientation='h', color=top_regions_volume.values,
labels={'x': 'Average Total Volume', 'y': 'Region'}, title='Top Regions by Average Total Volume',
color_continuous_scale='Jet')
fig.show()
In [16]:
# 过滤“TotalUS”区域的数据
total_us_data = avocado_data[avocado_data['region'] == 'TotalUS']
# 过滤其他区域的数据
other_regions_data = avocado_data[avocado_data['region'] != 'TotalUS']
# 计算“TotalUS”区域的“TotalVolume”之和
total_us_volume = total_us_data['TotalVolume'].sum()
# 计算其他区域的“TotalVolume”之和
other_regions_volume = other_regions_data.groupby('region')['TotalVolume'].sum().sum()
# 创建一个条形图来比较“TotalVolume”与“TotalUS”和其他
sns.barplot(x=['TotalUS', 'Other Regions'], y=[total_us_volume, other_regions_volume], palette='viridis')
plt.xlabel('Region')
plt.ylabel('Total Volume')
plt.title('Comparison of Total Volume for TotalUS and Other Regions')
plt.show()
In [17]:
# 可视化AveragePrice的分布
plt.figure(figsize=(6, 4))
sns.histplot(avocado_data['AveragePrice'], bins=30, kde=True, color='blue')
plt.title('Distribution of Average Price')
plt.show()
In [18]:
# 可视化TotalVolume的分布
plt.figure(figsize=(6, 4))
sns.histplot(avocado_data['TotalVolume'], bins=30, kde=True, color='green')
plt.title('Distribution of Total Volume')
plt.show()
In [19]:
# 将“日期”列转换为日期时间格式
avocado_data['Date'] = pd.to_datetime(avocado_data['Date'])
# 随时间变化的平均价格
plt.figure(figsize=(14, 6))
sns.lineplot(x='Date', y='AveragePrice', data=avocado_data, color='orange')
plt.title('Average Avocado Price Over Time')
plt.show()
In [20]:
# 交互式绘制平均价格随时间的变化
fig = px.line(avocado_data, x='Date', y='AveragePrice', title='Average Avocado Price Over Time', markers=True)
fig.update_layout(xaxis_title='Date', yaxis_title='Average Price')
fig.show()
In [21]:
# 绘制总体积随时间的变化
plt.figure(figsize=(14, 6))
sns.lineplot(x='Date', y='TotalVolume', data=avocado_data, color='purple')
plt.title('Total Avocado Volume Over Time')
plt.show()
In [22]:
# 交互式绘制平均价格随时间的变化
fig = px.line(avocado_data, x='Date', y='TotalVolume', title='Total Avocado Volume Over Time', markers=True)
fig.update_layout(xaxis_title='Date', yaxis_title='Total Volume')
fig.show()
In [23]:
# 设置自定义箱边
custom_bin_edges = [0, 50000, 100000, 150000, np.inf]
# 为TotalVolume创建箱子
avocado_data['VolumeCategory'] = pd.cut(avocado_data['TotalVolume'], bins=custom_bin_edges, labels=['Low', 'Medium', 'High', 'Very High'])
# 为“TotalBags”创建箱子
avocado_data['BagsCategory'] = pd.cut(avocado_data['TotalBags'], bins=custom_bin_edges, labels=['Low', 'Medium', 'High', 'Very High'])
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# “TotalVolume”的子图
ax1 = sns.countplot(x='VolumeCategory', data=avocado_data, palette='viridis', ax=axes[0])
ax1.set_title('Distribution of TotalVolume Categories')
ax1.set_xlabel('TotalVolume Category')
ax1.set_ylabel('Count')
# 为“TotalVolume”添加数据标签
for p in ax1.patches:
ax1.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='baseline', fontsize=10, color='black')
# “TotalBags”的子图
ax2 = sns.countplot(x='BagsCategory', data=avocado_data, palette='viridis', ax=axes[1])
ax2.set_title('Distribution of TotalBags Categories')
ax2.set_xlabel('TotalBags Category')
ax2.set_ylabel('Count')
# 为“TotalBags”添加数据标签
for p in ax2.patches:
ax2.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='baseline', fontsize=10, color='black')
plt.tight_layout()
plt.show()
In [24]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# 创建DataFrame的副本
X = avocado_data[['Date', 'type', 'region', 'VolumeCategory', 'BagsCategory']].copy()
y = avocado_data['AveragePrice']
# 提取年、月、日
X['Year'] = X['Date'].dt.year
X['Month'] = X['Date'].dt.month
X['Day'] = X['Date'].dt.day
# 删除原来的“Date”列
X = X.drop('Date', axis=1)
# 初始化LabelEncoder
label_encoder = LabelEncoder()
# 对分类列应用标签编码
X['type'] = label_encoder.fit_transform(X['type'])
X['region'] = label_encoder.fit_transform(X['region'])
X['VolumeCategory'] = label_encoder.fit_transform(X['VolumeCategory'])
X['BagsCategory'] = label_encoder.fit_transform(X['BagsCategory'])
# 将数据分成训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 初始化和训练线性回归模型
model = LinearRegression()
model.fit(X_train, y_train)
# 对测试集进行预测
predictions = model.predict(X_test)
# 评估模型
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}')
Mean Absolute Error: 0.21999210673812558
In [25]:
# 创建1行2列的子图
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
# 带图例的实际价格与预测价格的散点图
sns.scatterplot(x=y_test, y=predictions, label='Actual Prices', alpha=0.7, ax=axes[0])
sns.scatterplot(x=y_test, y=y_test, label='Predicted Prices', alpha=0.7, ax=axes[0])
axes[0].set_title('Actual vs. Predicted Prices with Legends')
axes[0].set_xlabel('Actual Prices')
axes[0].set_ylabel('Predicted Prices')
axes[0].legend()
# 带有图例的残差图
residuals = y_test - predictions
sns.scatterplot(x=predictions, y=residuals, label='Residuals', alpha=0.7, ax=axes[1])
axes[1].axhline(y=0, color='black', linestyle='--', label='Zero Residual Line')
axes[1].set_title('Residual Plot with Legends')
axes[1].set_xlabel('Predicted Prices')
axes[1].set_ylabel('Residuals')
axes[1].legend()
plt.tight_layout()
plt.show()
In [ ]: