Files
organize_excel_data/process_income_statement.py
2026-03-22 13:39:38 +08:00

560 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
利润表数据处理脚本
功能:
1. 将数字转换为以亿为单位(保留两位小数)
2. 删除配置文件中指定的行
3. 第一行为日期
4. 输出为 Excel 格式
"""
import pandas as pd
import json
import os
from pathlib import Path
def load_config(config_path):
    """Read the JSON configuration file at *config_path* and return the parsed object.

    The file is opened as UTF-8 text.
    """
    with open(config_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
def is_decimal_number(value):
    """Return True when *value* parses as a number whose absolute value is below 1.

    Missing values, empty strings and anything ``float()`` rejects give False.
    """
    is_blank = pd.isna(value) or value == '' or value is None
    if is_blank:
        return False
    try:
        parsed = float(value)
    except (ValueError, TypeError):
        return False
    # Magnitudes below 1 are treated as decimals / ratios.
    return abs(parsed) < 1
def is_large_number(value):
    """Return True when *value* parses as a number whose absolute value is at least 1.

    Missing values, empty strings and anything ``float()`` rejects give False.
    """
    is_blank = pd.isna(value) or value == '' or value is None
    if is_blank:
        return False
    try:
        parsed = float(value)
    except (ValueError, TypeError):
        return False
    # Magnitudes of 1 or more are the "large" figures that get unit-converted.
    return abs(parsed) >= 1
def convert_to_yi(value):
    """Convert a numeric cell to units of 1e8 (亿), rounded to two decimals.

    Args:
        value: A cell value; an int/float or anything whose ``str()`` form may
            look like a number (optionally prefixed with ``=`` as in
            ``"=557395000000"``, with thousands separators or in scientific
            notation such as ``"2.07327E+12"``).

    Returns:
        * ``value`` unchanged when it is missing/empty or cannot be parsed;
        * for numbers with absolute value below 1 (ratios): ``value``
          unchanged, except that a string starting with ``=`` is returned as
          the cleaned text without the ``=``;
        * otherwise ``round(number / 1e8, 2)``.
    """
    if pd.isna(value) or value == '' or value is None:
        return value

    cleaned = None
    if isinstance(value, (int, float)):
        num = float(value)
    else:
        cleaned = str(value).strip()
        # Drop the leading '=' of Excel-formula style cells.
        if cleaned.startswith('='):
            cleaned = cleaned[1:]
        # Remove grouping characters. The non-ASCII ones are written as
        # escapes so they cannot be lost or confused with ASCII look-alikes:
        # U+FF0C full-width comma, U+3000 ideographic space, U+00A0 NBSP.
        cleaned = cleaned.translate(
            str.maketrans('', '', ', \uff0c\u3000\u00a0')
        )
        if cleaned == '' or cleaned.lower() == 'nan':
            return value
        try:
            # float() also understands scientific notation.
            num = float(cleaned)
        except (ValueError, TypeError):
            return value

    if abs(num) < 1:
        # Ratios are not rescaled; only the formula marker is removed.
        if isinstance(value, str) and value.strip().startswith('='):
            return cleaned
        return value

    return round(num / 100000000, 2)
def find_row_by_name(df, row_name):
    """Return the rows of *df* whose first-column value equals *row_name*.

    An exact comparison is tried first; if nothing matches, the comparison is
    repeated with surrounding whitespace stripped from both the name and the
    string cells of the first column.

    Args:
        df: DataFrame whose first column holds the row names.
        row_name: Name to look for.

    Returns:
        A DataFrame with the matching rows (original index kept), or None when
        no row matches.
    """
    first_col = df.iloc[:, 0]

    # Exact match.
    mask = first_col == row_name
    if mask.any():
        return df[mask]

    # Whitespace-insensitive match. Only string cells are stripped, so a
    # non-string first column (e.g. all numeric) no longer raises from the
    # ``.str`` accessor; non-string cells are compared as they are.
    target = row_name.strip() if isinstance(row_name, str) else row_name
    stripped = first_col.map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )
    mask = stripped == target
    if mask.any():
        return df[mask]

    return None
def _parse_cell_number(cell):
"""将单元格解析为数字(支持 =123 格式),无法解析返回 None"""
if pd.isna(cell) or cell == '' or cell is None:
return None
s = str(cell).strip()
if s.startswith('='):
s = s[1:]
s = s.replace(',', '').replace(' ', '').replace('', '')
if not s or s.lower() == 'nan':
return None
try:
return float(s)
except (ValueError, TypeError):
return None
def _parse_date_cell(cell):
"""解析日期单元格,返回 'YYYY-MM-DD' 或 None"""
if pd.isna(cell) or cell == '' or cell is None:
return None
s = str(cell).strip()
if s.startswith('='):
s = s[1:].strip('"\'')
# 兼容 2025-12-31 或 2025/12/31
s = s.replace('/', '-')
if len(s) == 10 and s[4] == '-' and s[7] == '-':
return s
return None
def _date_to_quarter(date_str):
"""将日期转为季度显示2024-03-31 -> 2024Q1, 2024-06-30 -> 2024Q2 等"""
if not date_str or len(date_str) < 10:
return date_str or ''
year = date_str[:4]
end = date_str[5:] # MM-DD
if end == '03-31':
return f'{year}Q1'
if end == '06-30':
return f'{year}Q2'
if end == '09-30':
return f'{year}Q3'
if end == '12-31':
return f'{year}Q4'
# 其他日期按月份推断
try:
m = int(date_str[5:7])
q = (m - 1) // 3 + 1
return f'{year}Q{q}'
except (ValueError, TypeError):
return date_str
def build_profitability_df(input_file):
    """Build the data of the "profitability" (盈利能力) tab from a raw income-statement file.

    Extracted per reporting date: revenue (营业收入), gross profit (revenue
    minus 营业成本), gross margin (the 毛利率(GM) row when present, otherwise
    gross profit / revenue) and net profit attributable to the parent.

    When any reporting date ends on 03-31, 06-30 or 09-30, per-period
    ("单季") figures are added: within each calendar year the 03-31 value is
    used as-is and every later date gets its cumulative value minus the
    previous available cumulative value of that year (None when there is no
    previous value). Cumulative and per-period columns are separated by two
    empty columns. Dates run down the rows, newest first, and amounts are
    expressed in units of 1e8 (亿).

    Args:
        input_file: Path (str) of the raw file; ``.csv`` is read with
            ``pandas.read_csv``, anything else with ``pandas.read_excel``.

    Returns:
        A DataFrame whose first row(s) are the header rows, meant to be
        written without column names, or None when the date row or one of the
        required rows is missing or no column carries a parseable date.
    """
    from collections import defaultdict

    if input_file.endswith('.csv'):
        df = pd.read_csv(input_file, encoding='utf-8')
    else:
        df = pd.read_excel(input_file)

    # The date row is the one whose first cell is '日期'; only the first
    # five rows are searched.
    date_row_idx = None
    for i in range(min(5, len(df))):
        if str(df.iloc[i, 0]).strip() == '日期':
            date_row_idx = i
            break
    if date_row_idx is None:
        print("盈利能力: 未找到日期行,跳过")
        return None

    # Period data starts at the fourth column; the leading columns are skipped.
    data_start_col = 3

    # Stripped row name -> position of its first occurrence. Positions (not
    # index labels) are stored because rows are fetched with ``iloc``.
    first_position = {}
    for pos, name in enumerate(df.iloc[:, 0].astype(str).str.strip()):
        first_position.setdefault(name, pos)

    def get_row_values(row_name):
        """Return the parsed data cells of the named row, or None if absent."""
        pos = first_position.get(row_name)
        if pos is None:
            return None
        return [
            _parse_cell_number(cell)
            for cell in df.iloc[pos].iloc[data_start_col:]
        ]

    dates = [
        _parse_date_cell(cell)
        for cell in df.iloc[date_row_idx].iloc[data_start_col:]
    ]
    rev = get_row_values('营业收入')
    cost = get_row_values('营业成本')
    # Prefer the margin row of the statement; it is computed below if absent.
    gross_margin = get_row_values('毛利率(GM)')
    net_parent = get_row_values('归属于母公司股东及其他权益持有者的净利润')
    if net_parent is None:
        net_parent = get_row_values('归属于母公司普通股股东的净利润')
    if rev is None or cost is None or net_parent is None:
        print("盈利能力: 缺少营业收入/营业成本/归母净利润行,跳过")
        return None

    # Gross profit = revenue - cost of revenue.
    gross_profit = [
        r - c if r is not None and c is not None else None
        for r, c in zip(rev, cost)
    ]
    if gross_margin is None:
        gross_margin = [
            g / r if g is not None and r is not None and r != 0 else None
            for g, r in zip(gross_profit, rev)
        ]

    # Keep only the columns that carry a parseable date.
    rows_cum = [
        {'date': d, 'rev': r, 'gross': g, 'margin': m, 'net_parent': p}
        for d, r, g, m, p in zip(dates, rev, gross_profit, gross_margin, net_parent)
        if d is not None
    ]
    if not rows_cum:
        return None
    # Newest date first.
    rows_cum.sort(key=lambda x: x['date'], reverse=True)

    # Quarterly / half-year data is present when some date is not a year end.
    has_quarterly = any(
        r['date'].endswith(('-03-31', '-06-30', '-09-30')) for r in rows_cum
    )

    YI = 100000000  # 1e8, i.e. one 亿

    def to_yi(v):
        """Scale a raw amount to units of 1e8, keeping None as None."""
        return None if v is None else round(v / YI, 2)

    def cumulative_cells(r):
        """Return the label cell plus the four cumulative cells of one date."""
        return [
            _date_to_quarter(r['date']),
            to_yi(r['rev']),
            to_yi(r['gross']),
            round(r['margin'], 4) if r['margin'] is not None else None,
            to_yi(r['net_parent']),
        ]

    header = ['时间', '营业收入(亿)', '毛利(亿)', '毛利率', '归母净利润(亿)']

    if not has_quarterly:
        result_rows = [header]
        result_rows.extend(cumulative_cells(r) for r in rows_cum)
        return pd.DataFrame(result_rows)

    # Two header rows: the metric names repeated for both column groups, and
    # a second row saying which group is cumulative and which is per-period.
    result_rows = [
        header + ['', ''] + header[1:],
        ['', '累计', '累计', '累计', '累计', '', '', '单季', '单季', '单季', '单季'],
    ]

    by_year = defaultdict(list)
    for r in rows_cum:
        by_year[r['date'][:4]].append(r)

    def delta(current, previous):
        """Return current - previous, or None when either side is missing."""
        if current is None or previous is None:
            return None
        return current - previous

    # date -> per-period figures, computed year by year in ascending date order.
    date_to_single = {}
    for year in sorted(by_year):
        prev_rev = prev_gross = prev_net = None
        for item in sorted(by_year[year], key=lambda x: x['date']):
            d = item['date']
            rev_val, gross_val, net_val = item['rev'], item['gross'], item['net_parent']
            if d.endswith('-03-31'):
                # The first quarter's cumulative value is already a single quarter.
                sq_rev, sq_gross, sq_net = rev_val, gross_val, net_val
            elif prev_rev is None:
                # No earlier revenue figure this year to subtract from.
                sq_rev = sq_gross = sq_net = None
            else:
                sq_rev = delta(rev_val, prev_rev)
                sq_gross = delta(gross_val, prev_gross)
                sq_net = delta(net_val, prev_net)
            prev_rev, prev_gross, prev_net = rev_val, gross_val, net_val
            # A missing or zero per-period revenue leaves the margin undefined.
            sq_margin = (sq_gross / sq_rev) if (sq_rev and sq_gross is not None) else None
            date_to_single[d] = {
                'rev': sq_rev,
                'gross': sq_gross,
                'margin': sq_margin,
                'net_parent': sq_net,
            }

    for r in rows_cum:
        sq = date_to_single.get(r['date'], {})
        result_rows.append(
            cumulative_cells(r)
            + ['', '']
            + [
                to_yi(sq.get('rev')),
                to_yi(sq.get('gross')),
                round(sq['margin'], 4) if sq.get('margin') is not None else None,
                to_yi(sq.get('net_parent')),
            ]
        )
    return pd.DataFrame(result_rows)
def process_income_statement_return_df(input_file, config_file):
    """Process an income-statement file and return the result as a DataFrame.

    The rows named in the configuration's ``rows_to_delete`` list are dropped,
    the '日期' row (when found) is emitted first, and every remaining row is
    emitted with its first cell untouched and all other cells passed through
    ``convert_to_yi``.

    Args:
        input_file: Path (str) of the input CSV/Excel file.
        config_file: Path of the JSON configuration file.

    Returns:
        pandas.DataFrame: the processed rows, built without column names.
    """
    print(f"正在读取文件: {input_file}")
    is_csv = input_file.endswith('.csv')
    df = pd.read_csv(input_file, encoding='utf-8') if is_csv else pd.read_excel(input_file)
    print(f"文件读取成功,共 {len(df)}")

    rows_to_delete = load_config(config_file).get('rows_to_delete', [])
    print(f"配置文件加载成功,需要删除 {len(rows_to_delete)} 种行")

    # Collect the index of every row that matches a configured name.
    indices_to_delete = set()
    for row_name in rows_to_delete:
        matches = find_row_by_name(df, row_name)
        if matches is None or len(matches) == 0:
            continue
        for matched_idx in matches.index:
            indices_to_delete.add(matched_idx)
        print(f" 标记删除: {row_name}")
    print(f"\n共标记 {len(indices_to_delete)} 行待删除")

    output_rows = []

    # The date row goes first and is then excluded from the regular pass.
    date_matches = find_row_by_name(df, '日期')
    if date_matches is None or len(date_matches) == 0:
        print("警告: 未找到日期行")
    else:
        date_idx = date_matches.index[0]
        output_rows.append(df.iloc[date_idx].tolist())
        indices_to_delete.add(date_idx)
        print("已添加日期行作为第一行")

    print("\n开始处理数据...")
    kept = 0
    for position in range(len(df)):
        if position in indices_to_delete:
            continue
        cells = df.iloc[position].tolist()
        # The first cell is the row name; only the others are converted.
        output_rows.append([
            cell if col == 0 else convert_to_yi(cell)
            for col, cell in enumerate(cells)
        ])
        kept += 1
    print(f"共处理 {kept} 行数据")

    result_df = pd.DataFrame(output_rows)
    print("\n处理完成!")
    print(f"总行数: {len(result_df)}")
    return result_df
def process_income_statement(input_file, config_file, output_file):
    """Process an income-statement file and save the result as an .xlsx workbook.

    Args:
        input_file: Path (str) of the input CSV/Excel file.
        config_file: Path of the JSON configuration file.
        output_file: Requested output path (str). If it does not end with
            ``.xlsx`` its extension is replaced by ``.xlsx`` (or ``.xlsx`` is
            appended when it has none).
    """
    result_df = process_income_statement_return_df(input_file, config_file)

    print(f"\n正在保存结果到: {output_file}")

    # Always write xlsx. os.path.splitext only looks at the last path
    # component, so a dot in a directory name is never taken for the
    # extension separator.
    if not output_file.endswith('.xlsx'):
        output_file = os.path.splitext(output_file)[0] + '.xlsx'

    # Neither column names nor the index are written.
    result_df.to_excel(output_file, index=False, header=False, engine='openpyxl')
    print(f"输出文件: {output_file}")
def main():
    """Process every income-statement file found in the ``input`` directory.

    Paths are resolved relative to the directory of this script: inputs are
    read from ``input/``, the configuration from
    ``config/income_statement_config.json`` and results are written to
    ``output/`` (created if missing). Files whose name contains '利润表' or
    'income' are selected; when none match, every CSV/Excel file in the input
    directory is processed instead. A failure on one file is reported with a
    traceback and does not stop the remaining files.
    """
    import re
    import traceback

    script_dir = Path(__file__).parent
    input_dir = script_dir / 'input'
    config_dir = script_dir / 'config'
    output_dir = script_dir / 'output'
    output_dir.mkdir(exist_ok=True)

    config_file = config_dir / 'income_statement_config.json'
    if not config_file.exists():
        print(f"错误: 配置文件不存在: {config_file}")
        return

    if not input_dir.exists():
        print(f"错误: input目录不存在: {input_dir}")
        return

    # A file can match several patterns, so collect into a set; sorting then
    # gives a deterministic processing order.
    patterns = [
        '*利润表*.csv', '*利润表*.xlsx', '*利润表*.xls',
        '*income*.csv', '*income*.xlsx', '*income*.xls',
    ]
    input_files = sorted(
        {path for pattern in patterns for path in input_dir.glob(pattern)}
    )

    if not input_files:
        print(f"警告: 在 {input_dir} 目录下未找到利润表文件")
        print("提示: 文件名应包含'利润表''income'")
        print("\n如需处理所有文件,请修改脚本中的文件匹配规则")
        # Fallback: take every CSV/Excel file (no confirmation is requested).
        all_files = sorted(
            path
            for pattern in ('*.csv', '*.xlsx', '*.xls')
            for path in input_dir.glob(pattern)
        )
        if all_files:
            print(f"\n发现 {len(all_files)} 个文件,是否全部处理?")
            input_files = all_files

    if not input_files:
        print("未找到任何文件")
        return

    print(f"找到 {len(input_files)} 个文件待处理\n")

    for input_file in input_files:
        print(f"\n{'='*60}")
        print(f"处理文件: {input_file.name}")
        print(f"{'='*60}")

        # Output name: input stem without the "合并报表" marker, with runs of
        # underscores collapsed and leading/trailing underscores removed;
        # always .xlsx.
        base_name = input_file.stem
        base_name = base_name.replace('合并报表_', '').replace('合并报表', '')
        base_name = re.sub(r'_+', '_', base_name).strip('_')
        output_file = output_dir / f"{base_name}.xlsx"

        try:
            process_income_statement(
                str(input_file),
                str(config_file),
                str(output_file)
            )
        except Exception as e:
            # Top-level boundary: report the failure and go on to the next file.
            print(f"处理文件时出错: {e}")
            traceback.print_exc()

    print(f"\n{'='*60}")
    print("所有文件处理完成!")
    print(f"结果保存在: {output_dir}")
    print(f"{'='*60}")
# Run the batch processing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()