# organize_excel_data/process_income_statement.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
利润表数据处理脚本
功能：
1. 将数字转换为以亿为单位（保留两位小数）
2. 删除配置文件中指定的行
3. 第一行为日期
4. 输出为 Excel 格式
"""

import pandas as pd
import json
import os
from pathlib import Path


def load_config(config_path):
    """Read the UTF-8 encoded JSON configuration file and return the parsed object."""
    with open(config_path, encoding='utf-8') as config_file:
        return json.load(config_file)


def is_decimal_number(value):
    """Return True when *value* parses as a number whose absolute value is below 1.

    Missing values (NaN/None), the empty string and anything ``float()``
    rejects all yield False.
    """
    is_blank = pd.isna(value) or value == '' or value is None
    if is_blank:
        return False

    try:
        magnitude = abs(float(value))
    except (ValueError, TypeError):
        return False

    # Values strictly inside (-1, 1) are treated as decimals / ratios.
    return magnitude < 1


def is_large_number(value):
    """Return True when *value* parses as a number whose absolute value is at least 1.

    Missing values (NaN/None), the empty string and anything ``float()``
    rejects all yield False.
    """
    is_blank = pd.isna(value) or value == '' or value is None
    if is_blank:
        return False

    try:
        magnitude = abs(float(value))
    except (ValueError, TypeError):
        return False

    # Anything with magnitude >= 1 is considered an amount that needs converting.
    return magnitude >= 1


def convert_to_yi(value):
    """Convert an amount to units of 亿 (1e8), rounded to two decimals.

    Returns:
        * the input unchanged when it is missing, empty, or not numeric;
        * for magnitudes below 1 (treated as ratios): the input unchanged,
          except that a string starting with ``=`` is returned as its cleaned
          text (prefix, commas and spaces removed);
        * otherwise ``round(number / 1e8, 2)``.
    """
    if pd.isna(value) or value == '' or value is None:
        return value

    cleaned = None
    if isinstance(value, (int, float)):
        # Already numeric: no text clean-up required.
        amount = float(value)
    else:
        try:
            cleaned = str(value).strip()
            # Drop the Excel-formula style prefix, e.g. "=557395000000".
            if cleaned.startswith('='):
                cleaned = cleaned[1:]
            # Remove thousands separators (ASCII and full-width) and spaces.
            for separator in (',', ' ', '，'):
                cleaned = cleaned.replace(separator, '')
            if cleaned == '' or cleaned.lower() == 'nan':
                return value
            # float() also understands scientific notation such as "2.07327E+12".
            amount = float(cleaned)
        except (ValueError, TypeError):
            return value

    if abs(amount) < 1:
        # Ratios are not rescaled; only the formula prefix is stripped.
        had_formula_prefix = isinstance(value, str) and value.strip().startswith('=')
        return cleaned if had_formula_prefix else value

    return round(amount / 100000000, 2)


def find_row_by_name(df, row_name):
    """Return the rows of *df* whose first-column value equals *row_name*.

    An exact comparison is tried first; if nothing matches, the comparison is
    repeated with surrounding whitespace stripped from both sides.

    Args:
        df: DataFrame whose first column holds the row names.
        row_name: Name to look for.

    Returns:
        A DataFrame with the matching rows (original index labels preserved),
        or None when nothing matches or *df* has no columns.
    """
    if df.shape[1] == 0:
        return None

    first_col = df.iloc[:, 0]

    exact = first_col == row_name
    if exact.any():
        return df[exact]

    # Strip only real strings: the `.str` accessor raises AttributeError on a
    # non-string column (e.g. all numeric), which would abort the lookup
    # instead of reporting "not found".
    target = row_name.strip() if isinstance(row_name, str) else row_name
    trimmed = first_col.map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )
    loose = trimmed == target
    if loose.any():
        return df[loose]

    return None


def _parse_cell_number(cell):
    """将单元格解析为数字（支持 =123 格式），无法解析返回 None"""
    if pd.isna(cell) or cell == '' or cell is None:
        return None
    s = str(cell).strip()
    if s.startswith('='):
        s = s[1:]
    s = s.replace(',', '').replace(' ', '').replace('，', '')
    if not s or s.lower() == 'nan':
        return None
    try:
        return float(s)
    except (ValueError, TypeError):
        return None


def _parse_date_cell(cell):
    """解析日期单元格，返回 'YYYY-MM-DD' 或 None"""
    if pd.isna(cell) or cell == '' or cell is None:
        return None
    s = str(cell).strip()
    if s.startswith('='):
        s = s[1:].strip('"\'')
    # 兼容 2025-12-31 或 2025/12/31
    s = s.replace('/', '-')
    if len(s) == 10 and s[4] == '-' and s[7] == '-':
        return s
    return None


def _date_to_quarter(date_str):
    """将日期转为季度显示：2024-03-31 -> 2024Q1, 2024-06-30 -> 2024Q2 等"""
    if not date_str or len(date_str) < 10:
        return date_str or ''
    year = date_str[:4]
    end = date_str[5:]  # MM-DD
    if end == '03-31':
        return f'{year}Q1'
    if end == '06-30':
        return f'{year}Q2'
    if end == '09-30':
        return f'{year}Q3'
    if end == '12-31':
        return f'{year}Q4'
    # 其他日期按月份推断
    try:
        m = int(date_str[5:7])
        q = (m - 1) // 3 + 1
        return f'{year}Q{q}'
    except (ValueError, TypeError):
        return date_str


def _single_period_metrics(rows_cum):
    """Derive per-period figures from cumulative rows, keyed by period-end date.

    Within each calendar year the rows are ordered by date. A row ending on
    03-31 keeps its cumulative values; every later row gets its value minus
    the previous row of the same year. A non-03-31 row that is the first of
    its year has nothing to subtract and gets None for every metric.

    NOTE(review): the subtraction uses the previous *available* period, so if
    a quarter is absent (e.g. only 06-30 and 12-31 exist) the result spans
    more than one quarter — confirm this is the intended output.
    """
    from collections import defaultdict

    def delta(current, previous):
        # A difference only exists when both cumulative values exist.
        if current is None or previous is None:
            return None
        return current - previous

    by_year = defaultdict(list)
    for row in rows_cum:
        year = (row['date'] or '')[:4]
        if year:
            by_year[year].append(row)

    date_to_single = {}
    for year in sorted(by_year):
        previous = None
        for item in sorted(by_year[year], key=lambda x: (x['date'] or '')):
            d = item['date']
            if d.endswith('-03-31'):
                sq_rev, sq_gross, sq_net = item['rev'], item['gross'], item['net_parent']
            elif previous is None:
                sq_rev = sq_gross = sq_net = None
            else:
                # Each metric is differenced independently, so a missing
                # revenue in the previous period does not discard net profit.
                sq_rev = delta(item['rev'], previous['rev'])
                sq_gross = delta(item['gross'], previous['gross'])
                sq_net = delta(item['net_parent'], previous['net_parent'])
            previous = item

            # Margin needs a non-zero revenue and a known gross profit.
            if sq_rev and sq_gross is not None:
                sq_margin = sq_gross / sq_rev
            else:
                sq_margin = None
            date_to_single[d] = {
                'rev': sq_rev,
                'gross': sq_gross,
                'margin': sq_margin,
                'net_parent': sq_net,
            }
    return date_to_single


def build_profitability_df(input_file):
    """Build the "profitability" sheet from a raw income-statement file.

    Extracts revenue (营业收入), gross profit (revenue minus 营业成本), gross
    margin (the 毛利率(GM) row when present, otherwise gross profit / revenue)
    and net profit attributable to the parent. When any period ends on 03-31,
    06-30 or 09-30, per-period columns are added after two blank spacer
    columns. Amounts are divided by 1e8 (亿) and rounded to 2 decimals;
    margins are rounded to 4 decimals. Rows are ordered newest first.

    Args:
        input_file: Path to a CSV or Excel file (str or path-like). A name
            ending in ``.csv`` (any letter case) is read as UTF-8 CSV,
            anything else with ``pd.read_excel``.

    Returns:
        A DataFrame whose first one or two rows are the header rows (not
        column labels), or None when the date row, a required data row, or
        every usable date is missing.
    """
    if str(input_file).lower().endswith('.csv'):
        df = pd.read_csv(input_file, encoding='utf-8')
    else:
        df = pd.read_excel(input_file)

    # The date row is the one named 「日期」 among the first five rows.
    date_row_idx = next(
        (i for i in range(min(5, len(df))) if str(df.iloc[i, 0]).strip() == '日期'),
        None,
    )
    if date_row_idx is None:
        print("盈利能力: 未找到日期行，跳过")
        return None

    # Data starts at the 4th column. Per the original author's note the
    # leading columns hold report type / date / stock code / company name —
    # verify against the actual input layout.
    data_start_col = 3
    row_names = df.iloc[:, 0].astype(str).str.strip()

    def get_row_values(row_name):
        # First row whose stripped name equals row_name, parsed to floats/None.
        idx = row_names[row_names == row_name].index
        if len(idx) == 0:
            return None
        row = df.iloc[idx[0]]
        return [_parse_cell_number(row.iloc[c]) for c in range(data_start_col, len(row))]

    date_row = df.iloc[date_row_idx]
    dates = [_parse_date_cell(date_row.iloc[c]) for c in range(data_start_col, len(date_row))]

    rev = get_row_values('营业收入')
    cost = get_row_values('营业成本')
    # Prefer the reported margin row; it is computed below when absent.
    gross_margin = get_row_values('毛利率(GM)')
    net_parent = get_row_values('归属于母公司股东及其他权益持有者的净利润')
    if net_parent is None:
        net_parent = get_row_values('归属于母公司普通股股东的净利润')

    if rev is None or cost is None or net_parent is None:
        print("盈利能力: 缺少营业收入/营业成本/归母净利润行，跳过")
        return None

    # Gross profit = revenue - cost, only where both are known.
    gross_profit = [
        r - c if r is not None and c is not None else None
        for r, c in zip(rev, cost)
    ]

    if gross_margin is None:
        gross_margin = [
            g / r if g is not None and r is not None and r != 0 else None
            for g, r in zip(gross_profit, rev)
        ]
    else:
        gross_margin = list(gross_margin)

    YI = 100000000  # one 亿

    # Keep only the columns that carry a parseable date.
    rows_cum = [
        {'date': d, 'rev': r, 'gross': g, 'margin': m, 'net_parent': p}
        for d, r, g, m, p in zip(dates, rev, gross_profit, gross_margin, net_parent)
        if d is not None
    ]
    if not rows_cum:
        return None

    # Newest period first; ISO-style date strings sort chronologically.
    rows_cum.sort(key=lambda x: x['date'], reverse=True)

    # Interim data is present when any period ends on a non-year-end quarter end.
    has_quarterly = any(
        (r['date'] or '').endswith(('-03-31', '-06-30', '-09-30'))
        for r in rows_cum
    )

    def to_yi(v):
        if v is None:
            return None
        return round(v / YI, 2)

    def cumulative_cells(r):
        return [
            _date_to_quarter(r['date']),
            to_yi(r['rev']),
            to_yi(r['gross']),
            round(r['margin'], 4) if r['margin'] is not None else None,
            to_yi(r['net_parent']),
        ]

    if not has_quarterly:
        header = ['时间', '营业收入(亿)', '毛利(亿)', '毛利率', '归母净利润(亿)']
        result_rows = [header]
        result_rows.extend(cumulative_cells(r) for r in rows_cum)
        return pd.DataFrame(result_rows)

    header = ['时间', '营业收入(亿)', '毛利(亿)', '毛利率', '归母净利润(亿)', '', '', '营业收入(亿)', '毛利(亿)', '毛利率', '归母净利润(亿)']
    sub_header = ['', '累计', '累计', '累计', '累计', '', '', '单季', '单季', '单季', '单季']
    result_rows = [header, sub_header]

    date_to_single = _single_period_metrics(rows_cum)

    for r in rows_cum:
        sq = date_to_single.get(r['date'], {})
        result_rows.append(
            cumulative_cells(r)
            + ['', '']  # spacer columns between cumulative and per-period data
            + [
                to_yi(sq.get('rev')),
                to_yi(sq.get('gross')),
                round(sq['margin'], 4) if sq.get('margin') is not None else None,
                to_yi(sq.get('net_parent')),
            ]
        )

    return pd.DataFrame(result_rows)


def process_income_statement_return_df(input_file, config_file):
    """Process an income-statement file and return the result as a DataFrame.

    Steps: read the file, drop every row named in the config's
    ``rows_to_delete`` list, move the 「日期」 row (unconverted) to the top, and
    pass every non-first-column cell of the remaining rows through
    ``convert_to_yi``.

    Args:
        input_file: Path to the CSV/Excel input (str or path-like). A name
            ending in ``.csv`` (any letter case) is read as UTF-8 CSV,
            anything else with ``pd.read_excel``.
        config_file: Path to the JSON configuration file.

    Returns:
        pandas.DataFrame: the processed rows, built without column names.
    """
    print(f"正在读取文件: {input_file}")

    # Case-insensitive check so that e.g. "X.CSV" is not sent to read_excel.
    if str(input_file).lower().endswith('.csv'):
        df = pd.read_csv(input_file, encoding='utf-8')
    else:
        df = pd.read_excel(input_file)

    print(f"文件读取成功，共 {len(df)} 行")

    config = load_config(config_file)
    rows_to_delete = config.get('rows_to_delete', [])
    print(f"配置文件加载成功，需要删除 {len(rows_to_delete)} 种行")

    # Index labels of rows to leave out of the output. They are later compared
    # with positions, which coincide for the default index of a freshly read file.
    indices_to_delete = set()
    for row_name in rows_to_delete:
        row_data = find_row_by_name(df, row_name)
        if row_data is not None and len(row_data) > 0:
            for idx in row_data.index:
                indices_to_delete.add(idx)
                print(f"  标记删除: {row_name}")

    print(f"\n共标记 {len(indices_to_delete)} 行待删除")

    result_rows = []

    # The date row goes first and is excluded from the conversion pass below.
    date_row = find_row_by_name(df, '日期')
    if date_row is not None and len(date_row) > 0:
        date_idx = date_row.index[0]
        result_rows.append(df.iloc[date_idx].tolist())
        indices_to_delete.add(date_idx)
        print("已添加日期行作为第一行")
    else:
        print("警告: 未找到日期行")

    print("\n开始处理数据...")
    processed_count = 0

    for idx in range(len(df)):
        if idx in indices_to_delete:
            continue

        row = df.iloc[idx].tolist()
        # The first column is the row name and is kept as-is; the other
        # columns are converted to 亿 where applicable.
        result_rows.append(row[:1] + [convert_to_yi(value) for value in row[1:]])
        processed_count += 1

    print(f"共处理 {processed_count} 行数据")

    # No column names: the rows themselves carry the labels.
    result_df = pd.DataFrame(result_rows)

    print("\n处理完成！")
    print(f"总行数: {len(result_df)}")

    return result_df


def process_income_statement(input_file, config_file, output_file):
    """Process an income-statement file and save the result as an .xlsx file.

    Args:
        input_file: Path to the CSV/Excel input.
        config_file: Path to the JSON configuration file.
        output_file: Desired output path (str or path-like). If it does not
            end in ``.xlsx`` its extension is replaced by (or, when it has
            none, extended with) ``.xlsx``.
    """
    result_df = process_income_statement_return_df(input_file, config_file)

    print(f"\n正在保存结果到: {output_file}")

    # Always write xlsx. os.path.splitext only looks at the final path
    # component, so a dot in a directory name (e.g. "data.v2/result") is never
    # mistaken for the start of an extension.
    output_file = os.fspath(output_file)
    if not output_file.endswith('.xlsx'):
        output_file = os.path.splitext(output_file)[0] + '.xlsx'

    # Neither column names nor the index are written.
    result_df.to_excel(output_file, index=False, header=False, engine='openpyxl')

    print(f"输出文件: {output_file}")


def main():
    """Batch-process the income-statement files found in ``input/``.

    Directories are resolved relative to this script: inputs come from
    ``input/``, the configuration from ``config/income_statement_config.json``
    and results go to ``output/`` (created if missing). Files whose name
    contains 利润表 or ``income`` are processed; when none match, every
    CSV/Excel file in ``input/`` is processed instead. A failure on one file
    is reported with its traceback and does not stop the remaining files.
    """
    import re
    import traceback

    script_dir = Path(__file__).parent

    input_dir = script_dir / 'input'
    config_dir = script_dir / 'config'
    output_dir = script_dir / 'output'

    output_dir.mkdir(exist_ok=True)

    config_file = config_dir / 'income_statement_config.json'

    if not config_file.exists():
        print(f"错误: 配置文件不存在: {config_file}")
        return

    if not input_dir.exists():
        print(f"错误: input目录不存在: {input_dir}")
        return

    # Collect income-statement files by name pattern.
    patterns = ['*利润表*.csv', '*利润表*.xlsx', '*利润表*.xls',
                '*income*.csv', '*income*.xlsx', '*income*.xls']
    matched = set()
    for pattern in patterns:
        matched.update(input_dir.glob(pattern))

    # A file can match several patterns; sorting the de-duplicated set keeps
    # the processing order stable from run to run.
    input_files = sorted(matched)

    if not input_files:
        print(f"警告: 在 {input_dir} 目录下未找到利润表文件")
        print("提示: 文件名应包含'利润表'或'income'")
        print("\n如需处理所有文件，请修改脚本中的文件匹配规则")

        # Fallback: take every CSV/Excel file in the input directory.
        all_files = [
            *input_dir.glob('*.csv'),
            *input_dir.glob('*.xlsx'),
            *input_dir.glob('*.xls'),
        ]

        if all_files:
            # The message reads as a question, but no confirmation is
            # requested: the files are processed unconditionally.
            print(f"\n发现 {len(all_files)} 个文件，是否全部处理？")
            input_files = sorted(all_files)

    if not input_files:
        print("未找到任何文件")
        return

    print(f"找到 {len(input_files)} 个文件待处理\n")

    for input_file in input_files:
        print(f"\n{'='*60}")
        print(f"处理文件: {input_file.name}")
        print(f"{'='*60}")

        # Output name: the input stem without "合并报表", with runs of
        # underscores collapsed, always with the .xlsx extension.
        base_name = input_file.stem
        base_name = base_name.replace('合并报表_', '').replace('合并报表', '')
        base_name = re.sub(r'_+', '_', base_name).strip('_')
        output_file = output_dir / f"{base_name}.xlsx"

        try:
            process_income_statement(
                str(input_file),
                str(config_file),
                str(output_file)
            )
        except Exception as e:
            # Top-level boundary: report the failure and move on to the next file.
            print(f"处理文件时出错: {e}")
            traceback.print_exc()

    print(f"\n{'='*60}")
    print("所有文件处理完成！")
    print(f"结果保存在: {output_dir}")
    print(f"{'='*60}")


# Run the batch processing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()