import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select SimHei as the sans-serif font (the plots below use Chinese labels)
# and turn off the Unicode minus sign for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the ranking workbook into a DataFrame.
data0408=pd.read_excel("各地区企业避税天堂指数排名.xlsx")
# Preview the first five rows.
data0408.head(5)

# (row count, column count) of the loaded table.
data0408.shape

(140, 8)

# Print the column names, non-null counts, dtypes and memory usage.
data0408.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   年度        140 non-null    int64  
 1   地区排名      140 non-null    int64  
 2   地区名称      140 non-null    object 
 3   地区国际代码    140 non-null    object 
 4   CTHI占比    140 non-null    float64
 5   避税评分      140 non-null    float64
 6   全球规模比重    140 non-null    float64
 7   企业避税天堂指数  140 non-null    int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 8.9+ KB

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data0408.describe()

# Number of missing values in each column.
data0408.isnull().sum()

年度          0
地区排名        0
地区名称        0
地区国际代码      0
CTHI占比      0
避税评分        0
全球规模比重      0
企业避税天堂指数    0
dtype: int64

# Number of rows that are exact duplicates of an earlier row.
data0408.duplicated().sum()

0

# Histograms of the numeric columns.
# numeric_data holds the *labels* of the float64/int64 columns, not the data.
numeric_data = data0408.select_dtypes(include=['float64', 'int64']).columns
print(numeric_data)

for name in numeric_data:
    plt.figure()
    # Series.hist draws on the current figure and hands back its Axes.
    ax = data0408[name].hist(bins=20)
    ax.set_xlabel(name)
    ax.set_ylabel('频数')
    ax.set_title(f"{name}分布")
    plt.show()

Index(['年度', '地区排名', 'CTHI占比', '避税评分', '全球规模比重', '企业避税天堂指数'], dtype='object')

import plotly.express as px

# Interactive counterpart of the histograms: one Plotly figure per numeric column.
for name in numeric_data:
    fig = px.histogram(data0408, x=name, nbins=20)
    layout_labels = {
        "title": f"{name}分布",
        "xaxis_title": name,
        "yaxis_title": "频数",
    }
    fig.update_layout(**layout_labels)
    fig.show()

# Draw one box plot per numeric column, each in a figure of its own.
for name in numeric_data:
    plt.figure()
    # DataFrame.boxplot returns the Axes it drew on (default return_type).
    ax = data0408.boxplot(column=[name])
    ax.set_ylabel(name)
    ax.set_title(f'{name} 箱线图')
    plt.show()

# Pairwise correlation coefficients between the numeric columns.
correlation_matrix = data0408.loc[:, numeric_data].corr()
print(correlation_matrix)

# Raise the figure resolution; this is a global setting, so it also applies
# to every figure created after this point.
plt.rcParams['figure.dpi'] = 300

# Annotated heat map of the correlation matrix on a larger canvas.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='viridis',
    linewidths=0.5,
    annot_kws={'size': 10},
)
ax.set_title('Correlation Matrix Heatmap', fontsize=16)
plt.show()

                    年度          地区排名    CTHI占比      避税评分    全球规模比重  企业避税天堂指数
年度        1.000000e+00  5.818856e-14 -0.001155 -0.032411  0.041100 -0.013535
地区排名      5.818856e-14  1.000000e+00 -0.851756 -0.717165 -0.488688 -0.851799
CTHI占比   -1.154500e-03 -8.517560e-01  1.000000  0.700526  0.477954  0.999741
避税评分     -3.241095e-02 -7.171651e-01  0.700526  1.000000  0.056736  0.702175
全球规模比重    4.110031e-02 -4.886882e-01  0.477954  0.056736  1.000000  0.477304
企业避税天堂指数 -1.353519e-02 -8.517986e-01  0.999741  0.702175  0.477304  1.000000

# Per-year summary of the numeric indicators.
# The year column is named '年度'; the table has no '年度标识' column, so
# grouping by that label raises KeyError.
# The year is the grouping key, so it is left out of the aggregated columns.
value_cols = [col for col in numeric_data if col != '年度']

# Descriptive statistics of every indicator, one row per year.
yearly_stats = data0408.groupby('年度')[value_cols].describe()
print(yearly_stats)

# Mean of every indicator, one row per year.
yearly_mean = data0408.groupby('年度')[value_cols].mean()

# Line chart of the yearly means, one line per indicator.
plt.figure(figsize=(12, 8))
for col in value_cols:
    plt.plot(yearly_mean.index, yearly_mean[col], marker='o', label=col)

plt.title('各指标随年份的变化趋势')
plt.xlabel('年份')
# Put ticks only on the years that exist in the data instead of letting
# matplotlib interpolate fractional "years" between them.
plt.xticks(yearly_mean.index, rotation=45)
plt.ylabel('均值')
plt.legend()
plt.grid(True)
plt.show()

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[13], line 2
      1 # 按年份分组并计算统计描述
----> 2 yearly_stats = data0408.groupby('年度标识')[numeric_data].describe()
      3 print(yearly_stats)
      5 # 按年份分组并计算各指标的均值

File D:\Python310\lib\site-packages\pandas\core\frame.py:9183, in DataFrame.groupby(self, by, axis, level, as_index, sort, group_keys, observed, dropna)
   9180 if level is None and by is None:
   9181     raise TypeError("You have to supply one of 'by' and 'level'")
-> 9183 return DataFrameGroupBy(
   9184     obj=self,
   9185     keys=by,
   9186     axis=axis,
   9187     level=level,
   9188     as_index=as_index,
   9189     sort=sort,
   9190     group_keys=group_keys,
   9191     observed=observed,
   9192     dropna=dropna,
   9193 )

File D:\Python310\lib\site-packages\pandas\core\groupby\groupby.py:1329, in GroupBy.__init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, observed, dropna)
   1326 self.dropna = dropna
   1328 if grouper is None:
-> 1329     grouper, exclusions, obj = get_grouper(
   1330         obj,
   1331         keys,
   1332         axis=axis,
   1333         level=level,
   1334         sort=sort,
   1335         observed=False if observed is lib.no_default else observed,
   1336         dropna=self.dropna,
   1337     )
   1339 if observed is lib.no_default:
   1340     if any(ping._passed_categorical for ping in grouper.groupings):

File D:\Python310\lib\site-packages\pandas\core\groupby\grouper.py:1043, in get_grouper(obj, key, axis, level, sort, observed, validate, dropna)
   1041         in_axis, level, gpr = False, gpr, None
   1042     else:
-> 1043         raise KeyError(gpr)
   1044 elif isinstance(gpr, Grouper) and gpr.key is not None:
   1045     # Add key to exclusions
   1046     exclusions.add(gpr.key)

KeyError: '年度标识'

from sklearn.preprocessing import StandardScaler

# Cluster on the indicator columns only. The year and the rank position are
# left out: the year is not a property of a region, and the rank is just the
# ordering of the index that is already among the features.
feature_cols = [col for col in numeric_data if col not in ('年度', '地区排名')]
cluster_data = data0408[feature_cols]

# KMeans works on Euclidean distance, so columns with a large numeric range
# would otherwise dominate the result. Standardise every feature to zero mean
# and unit variance before fitting.
scaled_features = StandardScaler().fit_transform(cluster_data)

# Three clusters is an assumption; adjust as needed. n_init is set explicitly
# because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
data0408['cluster'] = kmeans.fit_predict(scaled_features)

# Number of rows assigned to each cluster.
print(data0408['cluster'].value_counts())

# Visualise the clusters on the first two indicator columns (original units).
x_col, y_col = feature_cols[0], feature_cols[1]
plt.figure(figsize=(10, 8))
plt.scatter(data0408[x_col], data0408[y_col], c=data0408['cluster'], cmap='viridis')
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.title('地区聚类分析')
plt.show()

from sklearn.ensemble import RandomForestRegressor

# Target: the haven index. Predictors: every other numeric column.
y = data0408['企业避税天堂指数']
X = data0408[[name for name in numeric_data if name != '企业避税天堂指数']]

# Fit a random forest with a fixed seed (fit returns the estimator itself).
rf = RandomForestRegressor(random_state=42).fit(X, y)

# Importance of each predictor, labelled by column name; printed in
# descending order while the stored Series keeps the column order.
feature_importance = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importance.sort_values(ascending=False))

CTHI占比    0.792820
地区排名      0.205040
全球规模比重    0.001122
避税评分      0.000837
年度        0.000181
dtype: float64

# Reshape to a wide table: one row per region, one column per year,
# each cell holding that region's rank in that year.
pivot_table = data0408.pivot(
    index='地区名称',
    columns='年度',
    values='地区排名',
)

# Standard deviation of each region's rank across the years; a smaller
# value means a more stable rank. Printed from most to least stable.
rank_stability = pivot_table.std(axis='columns')
print(rank_stability.sort_values(ascending=True))

地区名称
芬兰        0.000000
塞浦路斯      0.000000
坦桑尼亚      0.000000
美国        0.000000
安道尔       0.000000
           ...    
丹麦        5.656854
马恩岛       5.656854
哥斯达黎加     6.363961
波兰        7.778175
摩纳哥      10.606602
Length: 70, dtype: float64

from scipy import stats

# Boolean mask marking rows that contain at least one outlier. It is built on
# the frame's own index so that the final selection stays aligned even when
# data0408 does not carry a default 0..n-1 index.
outlier_mask = pd.Series(False, index=data0408.index)

for col in numeric_data:
    # Z-scores of this column, taken as a plain array so they are combined
    # with the mask by position.
    z_scores = np.asarray(stats.zscore(data0408[col]))
    # Flag rows whose absolute z-score in this column exceeds 3.
    outlier_mask = outlier_mask | (np.abs(z_scores) > 3)

# Rows flagged in at least one numeric column.
outliers = data0408[outlier_mask]
print(outliers)

      年度  地区排名     地区名称 地区国际代码  CTHI占比   避税评分  全球规模比重  企业避税天堂指数
0   2021     1  英属维尔京群岛    VGB    6.45  100.0    2.32      2853
3   2021     4       荷兰    NLD    5.54   79.9   11.09      2454
24  2021    25       美国    USA    1.16   46.8   12.36       514
70  2024     1  英属维尔京群岛    VGB    7.10  100.0    2.90      3061
71  2024     2     开曼群岛    CYM    6.70  100.0    2.40      2891
76  2024     7       荷兰    NLD    4.50   74.0   11.10      1945
94  2024    25       美国    USA    1.20   46.0   15.60       527

from scipy.stats import norm

# Labels of the float64/int64 columns.
numeric_cols = data0408.select_dtypes(include=['float64', 'int64']).columns

# Distinct years present in the data.
years = data0408['年度'].unique()

# One figure per numeric column: overlaid per-year density histograms, each
# with a fitted normal curve.
for col in numeric_cols:
    # The year column is the grouping key: inside a single year it is
    # constant, so its fitted standard deviation is 0 and norm.pdf divides
    # by zero. There is nothing meaningful to plot for it.
    if col == '年度':
        continue

    plt.figure(figsize=(12, 8))
    for year in years:
        # Values of this column for the current year, without missing entries.
        year_data = data0408[data0408['年度'] == year][col].dropna()
        if len(year_data) == 0:
            continue

        plt.hist(year_data, bins=20, density=True, alpha=0.5, label=str(year))

        # Fit a normal distribution (mean and standard deviation).
        mu, std = norm.fit(year_data)
        if not std > 0:
            # Degenerate sample (all values equal): no curve can be drawn.
            continue

        # Evaluate the fitted density across the current x-axis range.
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        plt.plot(x, norm.pdf(x, mu, std), 'k', linewidth=2, label=f'{year} Normal Fit')

    plt.title(f'{col} Distribution by Year with Normal Fit')
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.legend()
    plt.show()

D:\Python310\lib\site-packages\scipy\stats\_distn_infrastructure.py:2093: RuntimeWarning:

divide by zero encountered in divide

	年度	地区排名	CTHI占比	避税评分	全球规模比重	企业避税天堂指数
count	140.000000	140.000000	140.000000	140.000000	140.000000	140.000000
mean	2022.500000	35.500000	1.426214	66.554286	1.344714	622.400000
std	1.505386	20.277748	1.676480	17.605564	2.675524	730.875387
min	2021.000000	1.000000	0.000000	29.200000	0.000000	3.000000
25%	2021.000000	18.000000	0.300000	54.625000	0.100000	141.500000
50%	2022.500000	35.500000	0.635000	64.400000	0.185000	281.500000
75%	2024.000000	53.000000	2.100000	75.500000	1.125000	899.000000
max	2024.000000	70.000000	7.100000	100.000000	15.600000	3061.000000

	年度	地区排名	地区名称	地区国际代码	CTHI占比	避税评分	全球规模比重	企业避税天堂指数
0	2021	1	英属维尔京群岛	VGB	6.45	100.0	2.32	2853
1	2021	2	开曼群岛	CYM	5.99	100.0	1.87	2653
2	2021	3	百慕达	BMU	5.67	100.0	1.58	2508
3	2021	4	荷兰	NLD	5.54	79.9	11.09	2454
4	2021	5	瑞士	CHE	5.11	88.5	3.44	2261

📘 002经济数据分析与可视化Exam复习/0408.ipynb