Compare commits
4 Commits
25059a50a2
...
8ee25380e1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8ee25380e1 | ||
|
|
2fb2597ce1 | ||
|
|
bc6c758d8c | ||
|
|
fb97d46d98 |
@@ -6,7 +6,9 @@
|
|||||||
"Bash(python -c \"import ast, sys; ast.parse\\(open\\('sap-c4c-AttachmentFolder.py'\\).read\\(\\)\\); print\\('语法检查通过'\\)\")",
|
"Bash(python -c \"import ast, sys; ast.parse\\(open\\('sap-c4c-AttachmentFolder.py'\\).read\\(\\)\\); print\\('语法检查通过'\\)\")",
|
||||||
"Bash(git commit:*)",
|
"Bash(git commit:*)",
|
||||||
"Bash(head:*)",
|
"Bash(head:*)",
|
||||||
"Bash(python3:*)"
|
"Bash(python3:*)",
|
||||||
|
"Bash(curl -sI \"https://cdn.playwright.dev/chrome-for-testing-public/145.0.7632.6/linux64/chrome-headless-shell-linux64.zip\")",
|
||||||
|
"Bash(pip3 show:*)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
31
README.md
31
README.md
@@ -32,6 +32,7 @@ python -m playwright install-deps chromium
|
|||||||
| `datasource/` | 存放 SAP Analytics 导出的 CSV 文件 |
|
| `datasource/` | 存放 SAP Analytics 导出的 CSV 文件 |
|
||||||
| `downloads/` | 本地临时下载目录(上传 DSM 后自动清理) |
|
| `downloads/` | 本地临时下载目录(上传 DSM 后自动清理) |
|
||||||
| `error_log.txt` | 错误日志,每次批量运行时重置 |
|
| `error_log.txt` | 错误日志,每次批量运行时重置 |
|
||||||
|
| `failed_tickets.csv` | 失败的 Ticket ID 列表,可用于下次重试 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -83,11 +84,37 @@ python sap-c4c-AttachmentFolder.py ... --json
|
|||||||
python batch_download.py
|
python batch_download.py
|
||||||
```
|
```
|
||||||
|
|
||||||
- 默认读取 CSV 第二列的 Ticket ID,取前 10 个
|
- 读取 datasource 目录下 CSV/xlsx 的第二列,获取所有 Ticket ID
|
||||||
- 最多 5 个任务并行执行
|
- 最多 5 个任务并行执行
|
||||||
- 每个 Ticket 使用独立子目录 `downloads/{ticket_id}`,避免并行冲突
|
- 每个 Ticket 使用独立子目录 `downloads/{ticket_id}`,避免并行冲突
|
||||||
- 上传 DSM 完成后自动清理本地文件和子目录
|
- 上传 DSM 完成后自动清理本地文件和子目录
|
||||||
- 错误记录到 `error_log.txt`
|
- 错误记录到 `error_log.txt`,失败的 Ticket ID 保存到 `failed_tickets.csv`
|
||||||
|
|
||||||
|
### 守护进程
|
||||||
|
|
||||||
|
```shell
|
||||||
|
#启动后台下载
|
||||||
|
nohup python3 batch_download.py > run.log 2>&1 &
|
||||||
|
echo $! > batch.pid
|
||||||
|
|
||||||
|
#查看实时进度
|
||||||
|
tail -f run.log
|
||||||
|
|
||||||
|
#查看进程是否还在运行
|
||||||
|
cat batch.pid | xargs ps -p
|
||||||
|
|
||||||
|
#结束进程
|
||||||
|
cat batch.pid | xargs kill
|
||||||
|
```
|
||||||
|
|
||||||
|
### 重试失败的 Ticket
|
||||||
|
|
||||||
|
将上一次运行生成的 `failed_tickets.csv` 移动到 `datasource/` 目录,然后重新运行:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mv failed_tickets.csv datasource/
|
||||||
|
python batch_download.py
|
||||||
|
```
|
||||||
|
|
||||||
### 修改并行数或 Ticket 数量
|
### 修改并行数或 Ticket 数量
|
||||||
|
|
||||||
|
|||||||
@@ -24,29 +24,45 @@ WORKERS = 5
|
|||||||
|
|
||||||
SCRIPT = os.path.join(os.path.dirname(__file__), "sap-c4c-AttachmentFolder.py")
|
SCRIPT = os.path.join(os.path.dirname(__file__), "sap-c4c-AttachmentFolder.py")
|
||||||
ERROR_LOG = os.path.join(os.path.dirname(__file__), "error_log.txt")
|
ERROR_LOG = os.path.join(os.path.dirname(__file__), "error_log.txt")
|
||||||
|
FAILED_CSV = os.path.join(os.path.dirname(__file__), "failed_tickets.csv")
|
||||||
DATASOURCE = os.path.join(os.path.dirname(__file__), "datasource")
|
DATASOURCE = os.path.join(os.path.dirname(__file__), "datasource")
|
||||||
|
|
||||||
print_lock = threading.Lock()
|
print_lock = threading.Lock()
|
||||||
|
failed_lock = threading.Lock()
|
||||||
|
failed_ids = set()
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def get_ticket_ids(limit=10):
|
def get_ticket_ids():
|
||||||
ids, seen = [], set()
|
ids, seen = [], set()
|
||||||
|
|
||||||
|
def _add(val):
|
||||||
|
val = str(val).strip()
|
||||||
|
if val and val not in seen:
|
||||||
|
seen.add(val)
|
||||||
|
ids.append(val)
|
||||||
|
|
||||||
for csv_file in glob.glob(os.path.join(DATASOURCE, "*.csv")):
|
for csv_file in glob.glob(os.path.join(DATASOURCE, "*.csv")):
|
||||||
with open(csv_file, encoding="utf-8-sig") as f:
|
with open(csv_file, encoding="utf-8-sig") as f:
|
||||||
reader = csv.reader(f)
|
reader = csv.reader(f)
|
||||||
next(reader, None)
|
next(reader, None)
|
||||||
for row in reader:
|
for row in reader:
|
||||||
if len(row) < 2:
|
if len(row) >= 2:
|
||||||
|
_add(row[1])
|
||||||
|
|
||||||
|
for xlsx_file in glob.glob(os.path.join(DATASOURCE, "*.xlsx")):
|
||||||
|
import openpyxl
|
||||||
|
wb = openpyxl.load_workbook(xlsx_file, read_only=True, data_only=True)
|
||||||
|
ws = wb.active
|
||||||
|
first = True
|
||||||
|
for row in ws.iter_rows(min_col=2, max_col=2, values_only=True):
|
||||||
|
if first:
|
||||||
|
first = False
|
||||||
continue
|
continue
|
||||||
val = row[1].strip()
|
if row[0] is not None:
|
||||||
if val and val not in seen:
|
_add(row[0])
|
||||||
seen.add(val)
|
wb.close()
|
||||||
ids.append(val)
|
|
||||||
if len(ids) >= limit:
|
|
||||||
break
|
|
||||||
if len(ids) >= limit:
|
|
||||||
break
|
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
|
||||||
@@ -54,6 +70,11 @@ def log_error(ticket_id, message):
|
|||||||
ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
with open(ERROR_LOG, "a", encoding="utf-8") as f:
|
with open(ERROR_LOG, "a", encoding="utf-8") as f:
|
||||||
f.write(f"[{ts}] Ticket {ticket_id}: {message}\n")
|
f.write(f"[{ts}] Ticket {ticket_id}: {message}\n")
|
||||||
|
with failed_lock:
|
||||||
|
if ticket_id not in failed_ids:
|
||||||
|
failed_ids.add(ticket_id)
|
||||||
|
with open(FAILED_CSV, "a", encoding="utf-8", newline="") as f:
|
||||||
|
csv.writer(f).writerow([ticket_id])
|
||||||
|
|
||||||
|
|
||||||
def run_ticket(ticket_id, index, total):
|
def run_ticket(ticket_id, index, total):
|
||||||
@@ -117,16 +138,19 @@ def run_ticket(ticket_id, index, total):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
global failed_ids
|
||||||
print("读取 Ticket ID ...")
|
print("读取 Ticket ID ...")
|
||||||
ids = get_ticket_ids(10)
|
ids = get_ticket_ids()
|
||||||
if not ids:
|
if not ids:
|
||||||
print("未找到任何 Ticket ID,请检查 datasource 目录")
|
print("未找到任何 Ticket ID,请检查 datasource 目录")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
print(f"共 {len(ids)} 个 Ticket: {', '.join(ids)}")
|
print(f"共 {len(ids)} 个 Ticket: {', '.join(ids)}")
|
||||||
|
|
||||||
# 清空/创建 error_log
|
# 清空/创建 error_log 和 failed_tickets.csv
|
||||||
open(ERROR_LOG, "w").close()
|
open(ERROR_LOG, "w").close()
|
||||||
|
open(FAILED_CSV, "w", encoding="utf-8", newline="").close()
|
||||||
|
failed_ids.clear()
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=WORKERS) as executor:
|
with ThreadPoolExecutor(max_workers=WORKERS) as executor:
|
||||||
futures = {executor.submit(run_ticket, tid, i, len(ids)): tid
|
futures = {executor.submit(run_ticket, tid, i, len(ids)): tid
|
||||||
@@ -135,10 +159,12 @@ def main():
|
|||||||
future.result() # 触发异常传播(已在 run_ticket 内处理)
|
future.result() # 触发异常传播(已在 run_ticket 内处理)
|
||||||
|
|
||||||
print("\n全部完成。")
|
print("\n全部完成。")
|
||||||
|
if failed_ids:
|
||||||
|
print(f"失败 {len(failed_ids)} 个 Ticket,已保存到 {FAILED_CSV}")
|
||||||
if os.path.getsize(ERROR_LOG) > 0:
|
if os.path.getsize(ERROR_LOG) > 0:
|
||||||
print(f"有错误,详见 {ERROR_LOG}")
|
print(f"错误详情见 {ERROR_LOG}")
|
||||||
else:
|
else:
|
||||||
print("无错误。")
|
print("全部成功,无失败。")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
BIN
datasource/25年10月索赔报表.xlsx
Normal file
BIN
datasource/25年10月索赔报表.xlsx
Normal file
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -1,5 +1,6 @@
|
|||||||
requests
|
requests
|
||||||
urllib3
|
urllib3
|
||||||
|
openpyxl
|
||||||
# 可选:Salesforce 外链附件爬取
|
# 可选:Salesforce 外链附件爬取
|
||||||
scrapling[all]
|
scrapling[all]
|
||||||
playwright
|
playwright
|
||||||
|
|||||||
Reference in New Issue
Block a user