Compare commits

..

4 Commits

Author SHA1 Message Date
afei A
8ee25380e1 修改 README 2026-03-19 16:30:40 +08:00
afei A
2fb2597ce1 feat: 保存失败的 Ticket ID 到 CSV,支持重试 2026-03-19 16:25:40 +08:00
afei A
bc6c758d8c feat: 兼容 xlsx 文件读取 Ticket ID,取消数量限制 2026-03-19 15:39:32 +08:00
afei A
fb97d46d98 chore: 更新设置文件,移除 datasource csv 2026-03-19 15:34:04 +08:00
6 changed files with 75 additions and 63 deletions

View File

@@ -6,7 +6,9 @@
"Bash(python -c \"import ast, sys; ast.parse\\(open\\('sap-c4c-AttachmentFolder.py'\\).read\\(\\)\\); print\\('语法检查通过'\\)\")", "Bash(python -c \"import ast, sys; ast.parse\\(open\\('sap-c4c-AttachmentFolder.py'\\).read\\(\\)\\); print\\('语法检查通过'\\)\")",
"Bash(git commit:*)", "Bash(git commit:*)",
"Bash(head:*)", "Bash(head:*)",
"Bash(python3:*)" "Bash(python3:*)",
"Bash(curl -sI \"https://cdn.playwright.dev/chrome-for-testing-public/145.0.7632.6/linux64/chrome-headless-shell-linux64.zip\")",
"Bash(pip3 show:*)"
] ]
} }
} }

View File

@@ -32,6 +32,7 @@ python -m playwright install-deps chromium
| `datasource/` | 存放 SAP Analytics 导出的 CSV/xlsx 文件 | | `datasource/` | 存放 SAP Analytics 导出的 CSV/xlsx 文件 |
| `downloads/` | 本地临时下载目录(上传 DSM 后自动清理) | | `downloads/` | 本地临时下载目录(上传 DSM 后自动清理) |
| `error_log.txt` | 错误日志,每次批量运行时重置 | | `error_log.txt` | 错误日志,每次批量运行时重置 |
| `failed_tickets.csv` | 失败的 Ticket ID 列表,可用于下次重试 |
--- ---
@@ -83,11 +84,37 @@ python sap-c4c-AttachmentFolder.py ... --json
python batch_download.py python batch_download.py
``` ```
- 默认读取 CSV 第二列的 Ticket ID,取前 10 个 - 读取 datasource 目录下 CSV/xlsx 的第二列,获取所有 Ticket ID
- 最多 5 个任务并行执行 - 最多 5 个任务并行执行
- 每个 Ticket 使用独立子目录 `downloads/{ticket_id}`,避免并行冲突 - 每个 Ticket 使用独立子目录 `downloads/{ticket_id}`,避免并行冲突
- 上传 DSM 完成后自动清理本地文件和子目录 - 上传 DSM 完成后自动清理本地文件和子目录
- 错误记录到 `error_log.txt` - 错误记录到 `error_log.txt`,失败的 Ticket ID 保存到 `failed_tickets.csv`
### 守护进程
```shell
#启动后台下载
nohup python3 batch_download.py > run.log 2>&1 &
echo $! > batch.pid
#查看实时进度
tail -f run.log
#查看进程是否还在运行
cat batch.pid | xargs ps -p
#结束进程
cat batch.pid | xargs kill
```
### 重试失败的 Ticket
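将上一次运行生成的 `failed_tickets.csv` 移动到 `datasource/` 目录,然后重新运行(注意:脚本读取 CSV 时会跳过首行并只取第二列,而 `failed_tickets.csv` 只有一列且没有表头,需先补上表头行并把 Ticket ID 放到第二列;同时请移走 `datasource/` 中原有的数据文件,否则全部 Ticket 都会被重新处理):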
```bash
mv failed_tickets.csv datasource/
python batch_download.py
```
### 修改并行数或 Ticket 数量 ### 修改并行数或 Ticket 数量

View File

@@ -24,29 +24,45 @@ WORKERS = 5
SCRIPT = os.path.join(os.path.dirname(__file__), "sap-c4c-AttachmentFolder.py") SCRIPT = os.path.join(os.path.dirname(__file__), "sap-c4c-AttachmentFolder.py")
ERROR_LOG = os.path.join(os.path.dirname(__file__), "error_log.txt") ERROR_LOG = os.path.join(os.path.dirname(__file__), "error_log.txt")
FAILED_CSV = os.path.join(os.path.dirname(__file__), "failed_tickets.csv")
DATASOURCE = os.path.join(os.path.dirname(__file__), "datasource") DATASOURCE = os.path.join(os.path.dirname(__file__), "datasource")
print_lock = threading.Lock() print_lock = threading.Lock()
failed_lock = threading.Lock()
failed_ids = set()
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
def get_ticket_ids(limit=10): def get_ticket_ids():
ids, seen = [], set() ids, seen = [], set()
def _add(val):
val = str(val).strip()
if val and val not in seen:
seen.add(val)
ids.append(val)
for csv_file in glob.glob(os.path.join(DATASOURCE, "*.csv")): for csv_file in glob.glob(os.path.join(DATASOURCE, "*.csv")):
with open(csv_file, encoding="utf-8-sig") as f: with open(csv_file, encoding="utf-8-sig") as f:
reader = csv.reader(f) reader = csv.reader(f)
next(reader, None) next(reader, None)
for row in reader: for row in reader:
if len(row) < 2: if len(row) >= 2:
continue _add(row[1])
val = row[1].strip()
if val and val not in seen: for xlsx_file in glob.glob(os.path.join(DATASOURCE, "*.xlsx")):
seen.add(val) import openpyxl
ids.append(val) wb = openpyxl.load_workbook(xlsx_file, read_only=True, data_only=True)
if len(ids) >= limit: ws = wb.active
break first = True
if len(ids) >= limit: for row in ws.iter_rows(min_col=2, max_col=2, values_only=True):
break if first:
first = False
continue
if row[0] is not None:
_add(row[0])
wb.close()
return ids return ids
@@ -54,6 +70,11 @@ def log_error(ticket_id, message):
ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
with open(ERROR_LOG, "a", encoding="utf-8") as f: with open(ERROR_LOG, "a", encoding="utf-8") as f:
f.write(f"[{ts}] Ticket {ticket_id}: {message}\n") f.write(f"[{ts}] Ticket {ticket_id}: {message}\n")
with failed_lock:
if ticket_id not in failed_ids:
failed_ids.add(ticket_id)
with open(FAILED_CSV, "a", encoding="utf-8", newline="") as f:
csv.writer(f).writerow([ticket_id])
def run_ticket(ticket_id, index, total): def run_ticket(ticket_id, index, total):
@@ -117,16 +138,19 @@ def run_ticket(ticket_id, index, total):
def main(): def main():
global failed_ids
print("读取 Ticket ID ...") print("读取 Ticket ID ...")
ids = get_ticket_ids(10) ids = get_ticket_ids()
if not ids: if not ids:
print("未找到任何 Ticket ID请检查 datasource 目录") print("未找到任何 Ticket ID请检查 datasource 目录")
sys.exit(1) sys.exit(1)
print(f"{len(ids)} 个 Ticket: {', '.join(ids)}") print(f"{len(ids)} 个 Ticket: {', '.join(ids)}")
# 清空/创建 error_log # 清空/创建 error_log 和 failed_tickets.csv
open(ERROR_LOG, "w").close() open(ERROR_LOG, "w").close()
open(FAILED_CSV, "w", encoding="utf-8", newline="").close()
failed_ids.clear()
with ThreadPoolExecutor(max_workers=WORKERS) as executor: with ThreadPoolExecutor(max_workers=WORKERS) as executor:
futures = {executor.submit(run_ticket, tid, i, len(ids)): tid futures = {executor.submit(run_ticket, tid, i, len(ids)): tid
@@ -135,10 +159,12 @@ def main():
future.result() # 触发异常传播(已在 run_ticket 内处理) future.result() # 触发异常传播(已在 run_ticket 内处理)
print("\n全部完成。") print("\n全部完成。")
if os.path.getsize(ERROR_LOG) > 0: if failed_ids:
print(f"有错误,详见 {ERROR_LOG}") print(f"失败 {len(failed_ids)} 个 Ticket已保存到 {FAILED_CSV}")
if os.path.getsize(ERROR_LOG) > 0:
print(f"错误详情见 {ERROR_LOG}")
else: else:
print("无错误") print("全部成功,无失败")
if __name__ == "__main__": if __name__ == "__main__":

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,6 @@
requests requests
urllib3 urllib3
openpyxl
# 可选:Salesforce 外链附件爬取 # 可选:Salesforce 外链附件爬取
scrapling[all] scrapling[all]
playwright playwright