feat:解析新文件

This commit is contained in:
2026-03-22 13:39:38 +08:00
parent 1a67bb52d7
commit 10fb1e0896
13 changed files with 689 additions and 12 deletions

View File

@@ -102,20 +102,259 @@ def convert_to_yi(value):
def find_row_by_name(df, row_name):
    """Return the rows of ``df`` whose first-column value equals ``row_name``.

    An exact comparison is tried first. If nothing matches, the comparison is
    repeated with surrounding whitespace stripped from both the string cells
    of the first column and ``row_name``.

    Args:
        df: DataFrame whose first column holds the row labels.
        row_name: Label to look for.

    Returns:
        A DataFrame with every matching row, or None when nothing matches
        (including when ``df`` has no columns).
    """
    if df.shape[1] == 0:
        return None
    first_col = df.iloc[:, 0]

    # Exact match.
    mask = first_col == row_name
    if mask.any():
        return df[mask]

    # Whitespace-insensitive match. Only real strings are stripped, so a
    # numeric or mixed-type first column no longer raises through ``.str``.
    target = row_name.strip() if isinstance(row_name, str) else row_name
    normalized = first_col.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    mask = normalized == target
    if mask.any():
        return df[mask]
    return None
def _parse_cell_number(cell):
    """Parse a spreadsheet cell into a float.

    Accepts plain numbers, numeric strings with thousands separators, and
    formula-style text such as ``=123`` or ``="1,234.5"``.

    Args:
        cell: Raw cell value (number, string, None or NaN).

    Returns:
        The parsed value as a float, or None when the cell is empty, missing,
        not numeric, or not a finite number.
    """
    import math

    if cell is None or pd.isna(cell) or cell == '':
        return None

    text = str(cell).strip()
    if text.startswith('='):
        # Same convention as the date parser: ="..." wraps the value in quotes.
        text = text[1:].strip().strip('"\'')

    # Drop thousands separators (ASCII and full-width) and invisible marks.
    for junk in (',', '\uff0c', '\u200b', '\ufeff'):
        text = text.replace(junk, '')
    # split()/join removes every Unicode whitespace character, including
    # non-breaking and ideographic spaces that float() would reject.
    text = ''.join(text.split())
    if not text:
        return None

    try:
        value = float(text)
    except (ValueError, TypeError):
        return None
    # 'nan', 'inf' and friends parse successfully but are not usable amounts.
    return value if math.isfinite(value) else None
def _parse_date_cell(cell):
    """Parse a date cell into a ``'YYYY-MM-DD'`` string.

    Accepts ``date``/``datetime`` objects (pandas Timestamps included) and
    text such as ``2025-12-31``, ``2025/12/31``, ``2025/3/31``,
    ``="2025-12-31"`` or ``2025-12-31 00:00:00``.

    Args:
        cell: Raw cell value.

    Returns:
        The zero-padded ISO date string, or None when the cell is empty,
        missing, or not a real calendar date.
    """
    import datetime

    if cell is None or pd.isna(cell):
        return None
    if isinstance(cell, datetime.date):
        # Covers datetime.datetime and pandas.Timestamp (both subclass date).
        return f'{cell.year:04d}-{cell.month:02d}-{cell.day:02d}'

    text = str(cell).strip()
    if text.startswith('='):
        text = text[1:].strip().strip('"\'')
    tokens = text.split()
    if not tokens:
        return None

    # Keep only the date token (drops a trailing time part); accept '/' too.
    parts = tokens[0].replace('/', '-').split('-')
    if len(parts) != 3 or len(parts[0]) != 4:
        return None
    if not all(part.isdigit() for part in parts):
        return None
    try:
        year, month, day = (int(part) for part in parts)
        # date() validates the calendar, rejecting things like 2025-02-30.
        return datetime.date(year, month, day).isoformat()
    except ValueError:
        return None
def _date_to_quarter(date_str):
    """Convert a ``'YYYY-MM-DD'`` date into a quarter label.

    Examples: ``2024-03-31 -> 2024Q1``, ``2024-06-30 -> 2024Q2``,
    ``2024-05-15 -> 2024Q2``. The quarter is derived from the month, which
    covers the standard period ends as well as any other day of the quarter.

    Args:
        date_str: Date text; the month is read from characters 5-6.

    Returns:
        ``'<year>Q<n>'``; the input unchanged when it is shorter than 10
        characters or its month is not a number in 1..12; ``''`` for an
        empty or missing input.
    """
    if not date_str or len(date_str) < 10:
        return date_str or ''
    try:
        month = int(date_str[5:7])
    except (ValueError, TypeError):
        return date_str
    # Without this guard month 00 would become "Q0" and 13 would become "Q5".
    if not 1 <= month <= 12:
        return date_str
    return f'{date_str[:4]}Q{(month - 1) // 3 + 1}'
# Cumulative period end (the "-MM-DD" suffix) mapped to the period end that
# has to be subtracted from it to obtain the stand-alone quarter.
_PREVIOUS_QUARTER_END = {
    '-06-30': '-03-31',
    '-09-30': '-06-30',
    '-12-31': '-09-30',
}


def _subtract_or_none(current, base):
    """Return ``current - base``, or None when either operand is missing."""
    if current is None or base is None:
        return None
    return current - base


def _single_quarter_figures(rows_cum):
    """Derive stand-alone quarter figures from cumulative period rows.

    Rules: Q1 (03-31) equals the cumulative value; Q2 = 06-30 minus 03-31;
    Q3 = 09-30 minus 06-30; Q4 = 12-31 minus 09-30, always within the same
    year. When the required predecessor period is absent the quarter is left
    empty instead of subtracting some earlier period. Period ends that are
    not one of the four standard dates fall back to subtracting the period
    that precedes them in the same year.

    Args:
        rows_cum: Dicts with keys 'date', 'rev', 'gross', 'net_parent'.

    Returns:
        Dict mapping each date to a dict with keys 'rev', 'gross', 'margin'
        and 'net_parent' (values may be None).
    """
    # One row per date; the first occurrence wins so that duplicated date
    # columns are never subtracted from each other.
    first_by_date = {}
    for row in rows_cum:
        first_by_date.setdefault(row['date'], row)

    figures = {}
    previous = None
    for row in sorted(first_by_date.values(), key=lambda r: r['date']):
        date = row['date']
        year, suffix = date[:4], date[4:]
        if suffix == '-03-31':
            sq_rev, sq_gross, sq_net = row['rev'], row['gross'], row['net_parent']
        else:
            if suffix in _PREVIOUS_QUARTER_END:
                base = first_by_date.get(year + _PREVIOUS_QUARTER_END[suffix])
            elif previous is not None and previous['date'][:4] == year:
                base = previous
            else:
                base = None
            if base is None:
                sq_rev = sq_gross = sq_net = None
            else:
                # Each metric is handled independently: a gap in one metric
                # does not blank out the others.
                sq_rev = _subtract_or_none(row['rev'], base['rev'])
                sq_gross = _subtract_or_none(row['gross'], base['gross'])
                sq_net = _subtract_or_none(row['net_parent'], base['net_parent'])
        previous = row

        # A zero or missing quarterly revenue makes the margin undefined.
        sq_margin = sq_gross / sq_rev if (sq_rev and sq_gross is not None) else None
        figures[date] = {
            'rev': sq_rev,
            'gross': sq_gross,
            'margin': sq_margin,
            'net_parent': sq_net,
        }
    return figures


def build_profitability_df(input_file):
    """Build the "盈利能力" (profitability) tab from a raw income statement file.

    Extracts revenue, gross profit (revenue minus operating cost), gross
    margin and net profit attributable to the parent. When any period ends on
    03-31, 06-30 or 09-30, stand-alone quarter figures are added to the right
    of the cumulative ones, separated by two empty columns. Periods are laid
    out vertically, newest first, and amounts are expressed in units of
    100 million (亿).

    The file is expected to have the row labels in its first column, a row
    labelled '日期' within the first five data rows, and period data starting
    at the fourth column.

    NOTE(review): a '毛利率(GM)' row found in the sheet is used as-is; whether
    it stores a ratio or a percentage cannot be verified here, while the
    computed fallback is a ratio (gross profit / revenue) — confirm.

    Args:
        input_file: Path to a ``.csv`` file (read as UTF-8) or to an Excel
            workbook.

    Returns:
        A pandas DataFrame whose first row(s) are the header (and, for
        quarterly data, a second header row distinguishing cumulative from
        single-quarter columns), ready to be written to Excel; or None when
        the date row or a required metric row is missing or no period has a
        valid date.
    """
    # str()/lower() also accepts path-like objects and an upper-case extension.
    if str(input_file).lower().endswith('.csv'):
        df = pd.read_csv(input_file, encoding='utf-8')
    else:
        df = pd.read_excel(input_file)

    # Locate the date row: the first of the top five rows labelled '日期'.
    date_row_idx = None
    if df.shape[1] > 0:
        for i in range(min(5, len(df))):
            if str(df.iloc[i, 0]).strip() == '日期':
                date_row_idx = i
                break
    if date_row_idx is None:
        print("盈利能力: 未找到日期行,跳过")
        return None

    # Period data starts at the fourth column; the leading columns are skipped.
    data_start_col = 3
    row_names = [str(name).strip() for name in df.iloc[:, 0]]

    def get_row_values(row_name):
        """Return the parsed numbers of the first row labelled ``row_name``."""
        try:
            position = row_names.index(row_name)
        except ValueError:
            return None
        # ``position`` is positional, so it is safe to use with ``iloc``.
        return [_parse_cell_number(cell) for cell in df.iloc[position].iloc[data_start_col:]]

    dates = [_parse_date_cell(cell) for cell in df.iloc[date_row_idx].iloc[data_start_col:]]
    rev = get_row_values('营业收入')
    cost = get_row_values('营业成本')
    # Prefer the margin reported in the sheet; otherwise derive it below.
    gross_margin = get_row_values('毛利率(GM)')
    net_parent = get_row_values('归属于母公司股东及其他权益持有者的净利润')
    if net_parent is None:
        net_parent = get_row_values('归属于母公司普通股股东的净利润')
    if rev is None or cost is None or net_parent is None:
        print("盈利能力: 缺少营业收入/营业成本/归母净利润行,跳过")
        return None

    # Gross profit = revenue - operating cost.
    gross_profit = [_subtract_or_none(r, c) for r, c in zip(rev, cost)]
    if gross_margin is None:
        gross_margin = [
            g / r if (g is not None and r) else None
            for g, r in zip(gross_profit, rev)
        ]

    # Keep only the periods that carry a valid date.
    rows_cum = [
        {'date': d, 'rev': r, 'gross': g, 'margin': m, 'net_parent': n}
        for d, r, g, m, n in zip(dates, rev, gross_profit, gross_margin, net_parent)
        if d is not None
    ]
    if not rows_cum:
        return None
    # Newest period first.
    rows_cum.sort(key=lambda row: row['date'], reverse=True)

    # Quarterly / half-year data is present when any period does not end on 12-31.
    has_quarterly = any(
        row['date'].endswith(('-03-31', '-06-30', '-09-30')) for row in rows_cum
    )

    YI = 100000000  # 亿 (100 million)

    def to_yi(value):
        """Convert an amount to units of 亿, rounded to 2 decimals."""
        if value is None:
            return None
        return round(value / YI, 2)

    def fmt_margin(value):
        """Round a margin to 4 decimals, keeping None as None."""
        return round(value, 4) if value is not None else None

    base_header = ['时间', '营业收入(亿)', '毛利(亿)', '毛利率', '归母净利润(亿)']

    if not has_quarterly:
        result_rows = [base_header]
        for row in rows_cum:
            result_rows.append([
                _date_to_quarter(row['date']),
                to_yi(row['rev']),
                to_yi(row['gross']),
                fmt_margin(row['margin']),
                to_yi(row['net_parent']),
            ])
        return pd.DataFrame(result_rows)

    result_rows = [
        base_header + ['', ''] + base_header[1:],
        ['', '累计', '累计', '累计', '累计', '', '', '单季', '单季', '单季', '单季'],
    ]
    date_to_single = _single_quarter_figures(rows_cum)
    for row in rows_cum:
        single = date_to_single.get(row['date'], {})
        result_rows.append([
            _date_to_quarter(row['date']),
            to_yi(row['rev']),
            to_yi(row['gross']),
            fmt_margin(row['margin']),
            to_yi(row['net_parent']),
            '', '',
            to_yi(single.get('rev')),
            to_yi(single.get('gross')),
            fmt_margin(single.get('margin')),
            to_yi(single.get('net_parent')),
        ])
    return pd.DataFrame(result_rows)
def process_income_statement_return_df(input_file, config_file):
"""
处理利润表文件并返回 DataFrame