feat: Add multi-threaded concurrent download support

- Add ThreadPoolExecutor for parallel attachment downloads
- Add --max-workers parameter to control concurrency (default: 5)
- Implement thread-safe logging with Lock mechanism
- Refactor _do_download to use concurrent.futures
- Add _download_single_file and _download_single_link helper functions
- Update CLAUDE.md with multi-threading documentation

Performance improvements:
- File attachments (OData) now download in parallel
- Link attachments (Scrapling) now download in parallel
- Configurable worker threads for different network conditions
2026-03-12 13:01:13 +08:00
parent 929d3c2ec9
commit 84273a765e
2 changed files with 103 additions and 39 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -19,7 +19,7 @@ This is a SAP C4C (Cloud for Customer) attachment downloader toolkit that retrie
 2. Fetches ServiceRequest attachments via OData endpoints:
   - `/sap/c4c/odata/v1/c4codata` - Standard C4C OData API
   - `/sap/c4c/odata/cust/v1/custticketapi` - Custom ticket API
-3. Downloads two types of attachments:
+3. Downloads two types of attachments using **multi-threaded concurrent downloads**:
   - **File attachments** (CategoryCode=2): Downloaded via OData `$value` endpoint
   - **Link attachments** (CategoryCode=3): External Salesforce links scraped using Scrapling + Playwright
 4. Handles XIssueItem-level attachments via `BO_XSRIssueItemAttachmentFolder`
@@ -30,6 +30,11 @@ This is a SAP C4C (Cloud for Customer) attachment downloader toolkit that retrie
 - `scrapling[all]` - Web scraping framework with stealth capabilities
 - `playwright` - Browser automation for downloading Salesforce attachments
 **Performance features:**
 - Multi-threaded concurrent downloads (default: 5 threads, configurable via `--max-workers`)
 - Thread-safe output logging with lock mechanism
 - Parallel processing of both file and link attachments
 **Output modes:**
 - Human-readable console output (default)
 - JSON mode (`--json`) for programmatic consumption
@@ -71,6 +76,14 @@ python sap-c4c-AttachmentFolder.py \
  --password xxx \
  --ticket 24588
 # Download with custom thread count (default: 5)
 python sap-c4c-AttachmentFolder.py \
  --tenant https://xxx.c4c.saphybriscloud.cn \
  --user admin \
  --password xxx \
  --ticket 24588 \
  --max-workers 10
 # Download with DSM upload
 python sap-c4c-AttachmentFolder.py \
  --tenant https://xxx.c4c.saphybriscloud.cn \
--- a/sap-c4c-AttachmentFolder.py
+++ b/sap-c4c-AttachmentFolder.py
@@ -45,6 +45,8 @@ import requests
 import urllib3
 import xml.etree.ElementTree as ET
 from scrapling.fetchers import StealthyFetcher
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from threading import Lock
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -68,6 +70,10 @@ DSM_USER = ""
 DSM_PASSWORD = ""
 DSM_PATH = ""
 # Multi-threading configuration
 MAX_WORKERS = 5  # default number of concurrent download workers
 print_lock = Lock()  # held around print() calls so output from worker threads stays thread-safe
 def get_session():
    s = requests.Session()
@@ -516,48 +522,88 @@ def run(ticket_id, output_dir, list_only=False, json_mode=False):
    return result
 def _download_single_file(session, att, label, odata_url, json_mode):
    """Download one file attachment via OData and save it under OUTPUT_DIR.

    Intended to be submitted to a thread pool; console output is guarded by
    ``print_lock``.

    Args:
        session: HTTP session passed through to ``download_file_via_odata``.
        att: Attachment record; ``FileName`` is required, ``MimeType`` is optional.
        label: Source label copied into the result entry.
        odata_url: OData service URL passed through to ``download_file_via_odata``.
        json_mode: When true, nothing is printed to the console.

    Returns:
        dict: Result entry with ``source``, ``c4cName``, ``type`` and ``mime``,
        plus ``savedPath``/``savedName`` on success or ``error`` on failure.
    """
    entry = {"source": label, "c4cName": att["FileName"], "type": "file", "mime": att.get("MimeType")}
    try:
        # FileName is supplied by the remote system. Keep only its last path
        # component (treating "\\" as a separator too) so that a name such as
        # "../../x" cannot be written outside OUTPUT_DIR.
        safe_name = os.path.basename(att["FileName"].replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            raise ValueError(f"非法文件名: {att['FileName']!r}")
        content = download_file_via_odata(session, att, odata_url)
        file_path = os.path.join(OUTPUT_DIR, safe_name)
        with open(file_path, "wb") as f:
            f.write(content)
        entry["savedPath"] = os.path.abspath(file_path)
        entry["savedName"] = safe_name
        if not json_mode:
            with print_lock:
                print(f"    ✓ saved: {file_path}")
    except Exception as e:
        # Best-effort per attachment: record the failure on the entry instead
        # of letting one bad download abort the whole batch.
        entry["error"] = str(e)
        if not json_mode:
            with print_lock:
                print(f"    ✗ OData 下载失败 ({att['FileName']}): {e}")
    return entry
 def _download_single_link(link_att, label, json_mode):
    """Download one link attachment via Scrapling.

    Intended to be submitted to a thread pool; console output is guarded by
    ``print_lock``.

    Args:
        link_att: Attachment record; ``FileName`` is required, ``LinkWebURI``
            holds the external link and may be missing or empty.
        label: Source label copied into the result entry.
        json_mode: When true, nothing is printed to the console.

    Returns:
        dict: Result entry with ``source``, ``c4cName``, ``type`` and ``linkUrl``,
        plus ``savedPath``/``savedName`` on success or ``error`` on failure.
    """
    link_url = link_att.get("LinkWebURI")
    entry = {"source": label, "c4cName": link_att["FileName"], "type": "link", "linkUrl": link_url}

    if not link_url:
        entry["error"] = "无链接地址"
        return entry

    if not json_mode:
        with print_lock:
            print(f"    {link_att['FileName']}: {link_url}")

    try:
        r = download_link_via_scrapling(link_url, link_att["FileName"])
    except Exception as e:
        # Mirror _download_single_file: an exception escaping this worker would
        # leave no entry for this attachment in the results, so record the
        # failure on the entry and return it.
        entry["error"] = str(e)
        if not json_mode:
            with print_lock:
                print(f"      ✗ 下载失败: {e}")
        return entry

    if r["saved"]:
        entry["savedPath"] = os.path.abspath(r["saved"])
        entry["savedName"] = os.path.basename(r["saved"])
        if not json_mode:
            with print_lock:
                print(f"      ✓ saved: {r['saved']}")
    else:
        entry["error"] = r["error"]
        if not json_mode:
            with print_lock:
                print(f"      ✗ 下载失败: {r['error']}")
    return entry
 def _do_download(session, attachments, label, odata_url, result, json_mode):
-    """Run the downloads and append the results to result['downloadedFiles']."""
+    """Run the downloads and append the results to result['downloadedFiles'] (multi-threaded version)."""
    file_atts = [a for a in attachments if a["CategoryCode"] == "2"]
    link_atts = [a for a in attachments if a["CategoryCode"] == "3"]
-    # Link attachments -> Scrapling
+    downloaded_entries = []
    for a in link_atts:
        link_url = a.get("LinkWebURI")
        if not link_url:
            continue
        if not json_mode:
            print(f"    {a['FileName']}: {link_url}")
        r = download_link_via_scrapling(link_url, a["FileName"])
        entry = {"source": label, "c4cName": a["FileName"], "type": "link", "linkUrl": link_url}
        if r["saved"]:
            entry["savedPath"] = os.path.abspath(r["saved"])
            entry["savedName"] = os.path.basename(r["saved"])
            if not json_mode:
                print(f"      saved: {r['saved']}")
        else:
            entry["error"] = r["error"]
            if not json_mode:
                print(f"      下载失败: {r['error']}")
        result["downloadedFiles"].append(entry)
-    # File attachments -> OData
+    # Download concurrently using a thread pool
-    for att in file_atts:
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-        entry = {"source": label, "c4cName": att["FileName"], "type": "file", "mime": att.get("MimeType")}
+        futures = []
-        try:
+
-            content = download_file_via_odata(session, att, odata_url)
+        # Submit the file-attachment download tasks
-            file_path = os.path.join(OUTPUT_DIR, att["FileName"])
+        for att in file_atts:
-            with open(file_path, "wb") as f:
+            future = executor.submit(_download_single_file, session, att, label, odata_url, json_mode)
-                f.write(content)
+            futures.append(future)
-            entry["savedPath"] = os.path.abspath(file_path)
+
-            entry["savedName"] = att["FileName"]
+        # Submit the link-attachment download tasks
-            if not json_mode:
+        for att in link_atts:
-                print(f"    saved: {file_path}")
+            future = executor.submit(_download_single_link, att, label, json_mode)
-        except Exception as e:
+            futures.append(future)
-            entry["error"] = str(e)
+
-            if not json_mode:
+        # Collect the results
-                print(f"    OData 下载失败 ({att['FileName']}): {e}")
+        for future in as_completed(futures):
-        result["downloadedFiles"].append(entry)
+            try:
                entry = future.result()
                downloaded_entries.append(entry)
            except Exception as e:
                if not json_mode:
                    with print_lock:
                        print(f"    ✗ 下载任务异常: {e}")
    # Append the collected entries to the overall result
    result["downloadedFiles"].extend(downloaded_entries)
 def main():
@@ -572,6 +618,7 @@ def main():
    parser.add_argument("--output-dir", default="downloads", help="附件保存目录 (默认: downloads)")
    parser.add_argument("--json", action="store_true", dest="json_mode", help="JSON 输出模式（供程序调用）")
    parser.add_argument("--list-only", action="store_true", help="仅列出附件清单，不下载")
    parser.add_argument("--max-workers", type=int, default=5, help="并发下载线程数 (默认: 5)")
    # 群晖 DSM 上传参数
    parser.add_argument("--dsm-url", default=os.environ.get("DSM_URL", ""),
@@ -589,13 +636,14 @@ def main():
        parser.error("必须提供 --tenant, --user, --password 参数，或设置 C4C_TENANT, C4C_USERNAME, C4C_PASSWORD 环境变量")
    # 初始化全局配置
-    global TENANT, USERNAME, PASSWORD, ODATA_C4C, ODATA_CUST, SOAP_URL
+    global TENANT, USERNAME, PASSWORD, ODATA_C4C, ODATA_CUST, SOAP_URL, MAX_WORKERS
    TENANT = args.tenant.rstrip("/")
    USERNAME = args.user
    PASSWORD = args.password
    ODATA_C4C = f"{TENANT}/sap/c4c/odata/v1/c4codata"
    ODATA_CUST = f"{TENANT}/sap/c4c/odata/cust/v1/custticketapi"
    SOAP_URL = f"{TENANT}/sap/bc/srt/scs/sap/manageattachmentfolderin"
    MAX_WORKERS = args.max_workers
    # 初始化 DSM 配置
    global DSM_URL, DSM_USER, DSM_PASSWORD, DSM_PATH
@@ -604,6 +652,9 @@ def main():
    DSM_PASSWORD = args.dsm_password
    DSM_PATH = args.dsm_path
    if not args.json_mode and not args.list_only:
        print(f"并发下载线程数: {MAX_WORKERS}")
    result = run(args.ticket, args.output_dir, args.list_only, args.json_mode)
    # 下载完成后上传到群晖 DSM