一、主要库选择
1. openpyxl (推荐用于.xlsx文件)
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils import get_column_letter
# 基本操作
wb = load_workbook('data.xlsx')
ws = wb.active
# 读写数据
cell_value = ws['A1'].value
ws['B2'] = "新数据"
ws.cell(row=3, column=4, value=100)
# 保存
wb.save('output.xlsx')
2. pandas (数据处理首选)
import pandas as pd
# 读取Excel
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# 数据处理
df_filtered = df[df['销售额'] > 1000]
df_grouped = df.groupby('部门')['销售额'].sum()
# 写入Excel
df.to_excel('output.xlsx', index=False)
# 多表操作
with pd.ExcelWriter('multi_sheet.xlsx') as writer:
df1.to_excel(writer, sheet_name='Sheet1')
df2.to_excel(writer, sheet_name='Sheet2')
3. xlwings (需要与Excel交互)
import xlwings as xw
# 连接Excel应用
app = xw.App(visible=False) # 不可见模式
wb = app.books.open('data.xlsx')
ws = wb.sheets['Sheet1']
# 读取范围数据
data = ws.range('A1:D10').value
# 写入数据
ws.range('E1').value = [[1, 2], [3, 4]]
# 保存并关闭
wb.save()
wb.close()
app.quit()
二、常用自动化场景
1. 数据清洗与转换
import pandas as pd
def clean_excel_file(input_path, output_path):
# 读取数据
df = pd.read_excel(input_path)
# 删除空行
df = df.dropna(how='all')
# 填充缺失值
df = df.fillna({'部门': '未知', '销售额': 0})
# 数据类型转换
df['日期'] = pd.to_datetime(df['日期'])
df['金额'] = pd.to_numeric(df['金额'], errors='coerce')
# 重命名列
df = df.rename(columns={'旧列名': '新列名'})
# 删除重复行
df = df.drop_duplicates(subset=['关键列'])
# 保存结果
df.to_excel(output_path, index=False)
2. 批量处理多个文件
import os
import pandas as pd
from pathlib import Path
def batch_process_excel(folder_path, output_folder):
# 创建输出文件夹
Path(output_folder).mkdir(exist_ok=True)
# 遍历所有Excel文件
for file in Path(folder_path).glob('*.xlsx'):
try:
# 读取文件
df = pd.read_excel(file)
# 处理逻辑
df_processed = process_data(df)
# 保存结果
output_path = Path(output_folder) / f"processed_{file.name}"
df_processed.to_excel(output_path, index=False)
print(f"处理完成: {file.name}")
except Exception as e:
print(f"处理失败 {file.name}: {e}")
def process_data(df):
# 自定义处理逻辑
# 示例:计算统计指标
if '销售额' in df.columns:
df['销售占比'] = df['销售额'] / df['销售额'].sum()
return df
3. 格式设置
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment, Border, Side, PatternFill
def format_excel(file_path):
wb = load_workbook(file_path)
ws = wb.active
# 设置字体
header_font = Font(name='微软雅黑', size=12, bold=True, color='FFFFFF')
data_font = Font(name='宋体', size=11)
# 设置填充
header_fill = PatternFill(fill_type='solid',
start_color='366092',
end_color='366092')
# 设置边框
thin_border = Border(left=Side(style='thin'),
right=Side(style='thin'),
top=Side(style='thin'),
bottom=Side(style='thin'))
# 应用格式到表头
for cell in ws[1]:
cell.font = header_font
cell.fill = header_fill
cell.alignment = Alignment(horizontal='center', vertical='center')
# 应用格式到数据区域
for row in ws.iter_rows(min_row=2, max_col=ws.max_column, max_row=ws.max_row):
for cell in row:
cell.font = data_font
cell.border = thin_border
# 调整列宽
for column in ws.columns:
max_length = 0
column_letter = get_column_letter(column[0].column)
for cell in column:
try:
if len(str(cell.value)) > max_length:
max_length = len(str(cell.value))
except:
pass
adjusted_width = min(max_length + 2, 50)
ws.column_dimensions[column_letter].width = adjusted_width
wb.save('formatted.xlsx')
4. 创建报表模板
from openpyxl import Workbook
from openpyxl.chart import BarChart, Reference
def create_report(data_dict, output_path):
wb = Workbook()
ws = wb.active
ws.title = "销售报表"
# 写入表头
headers = ['部门', '季度', '销售额', '增长率']
ws.append(headers)
# 写入数据
for dept, quarters in data_dict.items():
for quarter, sales in quarters.items():
ws.append([dept, quarter, sales['sales'], sales['growth']])
# 创建图表
chart = BarChart()
chart.title = "销售情况"
chart.x_axis.title = "季度"
chart.y_axis.title = "销售额"
data = Reference(ws, min_col=3, min_row=2, max_row=ws.max_row, max_col=3)
cats = Reference(ws, min_col=2, min_row=2, max_row=ws.max_row)
chart.add_data(data, titles_from_data=False)
chart.set_categories(cats)
ws.add_chart(chart, "F2")
wb.save(output_path)
三、实用技巧
1. 性能优化
# 使用openpyxl的只读模式处理大文件
from openpyxl import load_workbook
def read_large_file(file_path):
# 只读模式,节省内存
wb = load_workbook(file_path, read_only=True)
ws = wb.active
# 逐行读取
for row in ws.iter_rows(values_only=True):
yield row
wb.close()
# 使用pandas的分块读取
chunk_size = 10000
chunks = pd.read_excel('large_file.xlsx', chunksize=chunk_size)
for chunk in chunks:
process_chunk(chunk)
2. 错误处理与日志
import logging
from datetime import datetime
# 设置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f'excel_processing_{datetime.now():%Y%m%d}.log'),
logging.StreamHandler()
]
)
def safe_excel_operation(file_path):
try:
df = pd.read_excel(file_path)
# 业务逻辑
logging.info(f"成功处理文件: {file_path}")
return True
except FileNotFoundError:
logging.error(f"文件不存在: {file_path}")
return False
except Exception as e:
logging.error(f"处理文件时出错 {file_path}: {e}")
return False
3. 自动化任务调度
import schedule
import time
from datetime import datetime
def daily_report_task():
"""每日自动生成报表"""
input_file = f"data_{datetime.now():%Y%m%d}.xlsx"
output_file = f"report_{datetime.now():%Y%m%d}.xlsx"
if os.path.exists(input_file):
generate_report(input_file, output_file)
print(f"报表已生成: {output_file}")
else:
print(f"等待数据文件: {input_file}")
# 定时任务
schedule.every().day.at("09:00").do(daily_report_task)
while True:
schedule.run_pending()
time.sleep(60)
四、最佳实践建议
选择合适的库:
- 简单读写:pandas
- 复杂格式:openpyxl
- Excel交互:xlwings
- .xls文件:xlrd/xlwt
处理大型文件:
- 使用chunksize分块读取
- 避免在内存中保留整个工作簿
- 考虑使用数据库存储中间结果
代码组织:
- 将业务逻辑与IO操作分离
- 使用配置文件管理参数
- 添加完整的错误处理
测试与验证:
- 编写单元测试验证数据处理逻辑
- 检查数据完整性和一致性
- 创建数据验证规则
这些方法涵盖了Excel自动化的主要场景,可以根据具体需求选择合适的技术方案。