feat: Add multi-threaded concurrent download support
- Add ThreadPoolExecutor for parallel attachment downloads
- Add --max-workers parameter to control concurrency (default: 5)
- Implement thread-safe logging with Lock mechanism
- Refactor _do_download to use concurrent.futures
- Add _download_single_file and _download_single_link helper functions
- Update CLAUDE.md with multi-threading documentation

Performance improvements:
- File attachments (OData) now download in parallel
- Link attachments (Scrapling) now download in parallel
- Configurable worker threads for different network conditions
This commit is contained in:
15
CLAUDE.md
15
CLAUDE.md
@@ -19,7 +19,7 @@ This is a SAP C4C (Cloud for Customer) attachment downloader toolkit that retrie
|
|||||||
2. Fetches ServiceRequest attachments via OData endpoints:
|
2. Fetches ServiceRequest attachments via OData endpoints:
|
||||||
- `/sap/c4c/odata/v1/c4codata` - Standard C4C OData API
|
- `/sap/c4c/odata/v1/c4codata` - Standard C4C OData API
|
||||||
- `/sap/c4c/odata/cust/v1/custticketapi` - Custom ticket API
|
- `/sap/c4c/odata/cust/v1/custticketapi` - Custom ticket API
|
||||||
3. Downloads two types of attachments:
|
3. Downloads two types of attachments using **multi-threaded concurrent downloads**:
|
||||||
- **File attachments** (CategoryCode=2): Downloaded via OData `$value` endpoint
|
- **File attachments** (CategoryCode=2): Downloaded via OData `$value` endpoint
|
||||||
- **Link attachments** (CategoryCode=3): External Salesforce links scraped using Scrapling + Playwright
|
- **Link attachments** (CategoryCode=3): External Salesforce links scraped using Scrapling + Playwright
|
||||||
4. Handles XIssueItem-level attachments via `BO_XSRIssueItemAttachmentFolder`
|
4. Handles XIssueItem-level attachments via `BO_XSRIssueItemAttachmentFolder`
|
||||||
@@ -30,6 +30,11 @@ This is a SAP C4C (Cloud for Customer) attachment downloader toolkit that retrie
|
|||||||
- `scrapling[all]` - Web scraping framework with stealth capabilities
|
- `scrapling[all]` - Web scraping framework with stealth capabilities
|
||||||
- `playwright` - Browser automation for downloading Salesforce attachments
|
- `playwright` - Browser automation for downloading Salesforce attachments
|
||||||
|
|
||||||
|
**Performance features:**
|
||||||
|
- Multi-threaded concurrent downloads (default: 5 threads, configurable via `--max-workers`)
|
||||||
|
- Thread-safe output logging with lock mechanism
|
||||||
|
- Parallel processing of both file and link attachments
|
||||||
|
|
||||||
**Output modes:**
|
**Output modes:**
|
||||||
- Human-readable console output (default)
|
- Human-readable console output (default)
|
||||||
- JSON mode (`--json`) for programmatic consumption
|
- JSON mode (`--json`) for programmatic consumption
|
||||||
@@ -71,6 +76,14 @@ python sap-c4c-AttachmentFolder.py \
|
|||||||
--password xxx \
|
--password xxx \
|
||||||
--ticket 24588
|
--ticket 24588
|
||||||
|
|
||||||
|
# Download with custom thread count (default: 5)
|
||||||
|
python sap-c4c-AttachmentFolder.py \
|
||||||
|
--tenant https://xxx.c4c.saphybriscloud.cn \
|
||||||
|
--user admin \
|
||||||
|
--password xxx \
|
||||||
|
--ticket 24588 \
|
||||||
|
--max-workers 10
|
||||||
|
|
||||||
# Download with DSM upload
|
# Download with DSM upload
|
||||||
python sap-c4c-AttachmentFolder.py \
|
python sap-c4c-AttachmentFolder.py \
|
||||||
--tenant https://xxx.c4c.saphybriscloud.cn \
|
--tenant https://xxx.c4c.saphybriscloud.cn \
|
||||||
|
|||||||
@@ -45,6 +45,8 @@ import requests
|
|||||||
import urllib3
|
import urllib3
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from scrapling.fetchers import StealthyFetcher
|
from scrapling.fetchers import StealthyFetcher
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from threading import Lock
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
@@ -68,6 +70,10 @@ DSM_USER = ""
|
|||||||
DSM_PASSWORD = ""
|
DSM_PASSWORD = ""
|
||||||
DSM_PATH = ""
|
DSM_PATH = ""
|
||||||
|
|
||||||
|
# Multi-threading configuration
|
||||||
|
MAX_WORKERS = 5 # Default number of concurrent download threads; main() overwrites it from --max-workers
|
||||||
|
print_lock = Lock() # Held around print() calls so output from worker threads is not interleaved
|
||||||
|
|
||||||
|
|
||||||
def get_session():
|
def get_session():
|
||||||
s = requests.Session()
|
s = requests.Session()
|
||||||
@@ -516,48 +522,88 @@ def run(ticket_id, output_dir, list_only=False, json_mode=False):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _download_single_file(session, att, label, odata_url, json_mode):
|
||||||
|
"""下载单个文件附件(用于多线程)"""
|
||||||
|
entry = {"source": label, "c4cName": att["FileName"], "type": "file", "mime": att.get("MimeType")}
|
||||||
|
try:
|
||||||
|
content = download_file_via_odata(session, att, odata_url)
|
||||||
|
file_path = os.path.join(OUTPUT_DIR, att["FileName"])
|
||||||
|
with open(file_path, "wb") as f:
|
||||||
|
f.write(content)
|
||||||
|
entry["savedPath"] = os.path.abspath(file_path)
|
||||||
|
entry["savedName"] = att["FileName"]
|
||||||
|
if not json_mode:
|
||||||
|
with print_lock:
|
||||||
|
print(f" ✓ saved: {file_path}")
|
||||||
|
except Exception as e:
|
||||||
|
entry["error"] = str(e)
|
||||||
|
if not json_mode:
|
||||||
|
with print_lock:
|
||||||
|
print(f" ✗ OData 下载失败 ({att['FileName']}): {e}")
|
||||||
|
return entry
|
||||||
|
|
||||||
|
|
||||||
|
def _download_single_link(link_att, label, json_mode):
|
||||||
|
"""下载单个链接附件(用于多线程)"""
|
||||||
|
link_url = link_att.get("LinkWebURI")
|
||||||
|
entry = {"source": label, "c4cName": link_att["FileName"], "type": "link", "linkUrl": link_url}
|
||||||
|
|
||||||
|
if not link_url:
|
||||||
|
entry["error"] = "无链接地址"
|
||||||
|
return entry
|
||||||
|
|
||||||
|
if not json_mode:
|
||||||
|
with print_lock:
|
||||||
|
print(f" {link_att['FileName']}: {link_url}")
|
||||||
|
|
||||||
|
r = download_link_via_scrapling(link_url, link_att["FileName"])
|
||||||
|
if r["saved"]:
|
||||||
|
entry["savedPath"] = os.path.abspath(r["saved"])
|
||||||
|
entry["savedName"] = os.path.basename(r["saved"])
|
||||||
|
if not json_mode:
|
||||||
|
with print_lock:
|
||||||
|
print(f" ✓ saved: {r['saved']}")
|
||||||
|
else:
|
||||||
|
entry["error"] = r["error"]
|
||||||
|
if not json_mode:
|
||||||
|
with print_lock:
|
||||||
|
print(f" ✗ 下载失败: {r['error']}")
|
||||||
|
return entry
|
||||||
|
|
||||||
|
|
||||||
def _do_download(session, attachments, label, odata_url, result, json_mode):
|
def _do_download(session, attachments, label, odata_url, result, json_mode):
|
||||||
"""执行下载并将结果追加到 result['downloadedFiles']"""
|
"""执行下载并将结果追加到 result['downloadedFiles'](多线程版本)"""
|
||||||
file_atts = [a for a in attachments if a["CategoryCode"] == "2"]
|
file_atts = [a for a in attachments if a["CategoryCode"] == "2"]
|
||||||
link_atts = [a for a in attachments if a["CategoryCode"] == "3"]
|
link_atts = [a for a in attachments if a["CategoryCode"] == "3"]
|
||||||
|
|
||||||
# 链接附件 -> Scrapling
|
downloaded_entries = []
|
||||||
for a in link_atts:
|
|
||||||
link_url = a.get("LinkWebURI")
|
|
||||||
if not link_url:
|
|
||||||
continue
|
|
||||||
if not json_mode:
|
|
||||||
print(f" {a['FileName']}: {link_url}")
|
|
||||||
r = download_link_via_scrapling(link_url, a["FileName"])
|
|
||||||
entry = {"source": label, "c4cName": a["FileName"], "type": "link", "linkUrl": link_url}
|
|
||||||
if r["saved"]:
|
|
||||||
entry["savedPath"] = os.path.abspath(r["saved"])
|
|
||||||
entry["savedName"] = os.path.basename(r["saved"])
|
|
||||||
if not json_mode:
|
|
||||||
print(f" saved: {r['saved']}")
|
|
||||||
else:
|
|
||||||
entry["error"] = r["error"]
|
|
||||||
if not json_mode:
|
|
||||||
print(f" 下载失败: {r['error']}")
|
|
||||||
result["downloadedFiles"].append(entry)
|
|
||||||
|
|
||||||
# 文件附件 -> OData
|
# 使用线程池并发下载
|
||||||
for att in file_atts:
|
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
||||||
entry = {"source": label, "c4cName": att["FileName"], "type": "file", "mime": att.get("MimeType")}
|
futures = []
|
||||||
try:
|
|
||||||
content = download_file_via_odata(session, att, odata_url)
|
# 提交文件附件下载任务
|
||||||
file_path = os.path.join(OUTPUT_DIR, att["FileName"])
|
for att in file_atts:
|
||||||
with open(file_path, "wb") as f:
|
future = executor.submit(_download_single_file, session, att, label, odata_url, json_mode)
|
||||||
f.write(content)
|
futures.append(future)
|
||||||
entry["savedPath"] = os.path.abspath(file_path)
|
|
||||||
entry["savedName"] = att["FileName"]
|
# 提交链接附件下载任务
|
||||||
if not json_mode:
|
for att in link_atts:
|
||||||
print(f" saved: {file_path}")
|
future = executor.submit(_download_single_link, att, label, json_mode)
|
||||||
except Exception as e:
|
futures.append(future)
|
||||||
entry["error"] = str(e)
|
|
||||||
if not json_mode:
|
# 收集结果
|
||||||
print(f" OData 下载失败 ({att['FileName']}): {e}")
|
for future in as_completed(futures):
|
||||||
result["downloadedFiles"].append(entry)
|
try:
|
||||||
|
entry = future.result()
|
||||||
|
downloaded_entries.append(entry)
|
||||||
|
except Exception as e:
|
||||||
|
if not json_mode:
|
||||||
|
with print_lock:
|
||||||
|
print(f" ✗ 下载任务异常: {e}")
|
||||||
|
|
||||||
|
# 将结果追加到总结果中
|
||||||
|
result["downloadedFiles"].extend(downloaded_entries)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@@ -572,6 +618,7 @@ def main():
|
|||||||
parser.add_argument("--output-dir", default="downloads", help="附件保存目录 (默认: downloads)")
|
parser.add_argument("--output-dir", default="downloads", help="附件保存目录 (默认: downloads)")
|
||||||
parser.add_argument("--json", action="store_true", dest="json_mode", help="JSON 输出模式(供程序调用)")
|
parser.add_argument("--json", action="store_true", dest="json_mode", help="JSON 输出模式(供程序调用)")
|
||||||
parser.add_argument("--list-only", action="store_true", help="仅列出附件清单,不下载")
|
parser.add_argument("--list-only", action="store_true", help="仅列出附件清单,不下载")
|
||||||
|
parser.add_argument("--max-workers", type=int, default=5, help="并发下载线程数 (默认: 5)")
|
||||||
|
|
||||||
# 群晖 DSM 上传参数
|
# 群晖 DSM 上传参数
|
||||||
parser.add_argument("--dsm-url", default=os.environ.get("DSM_URL", ""),
|
parser.add_argument("--dsm-url", default=os.environ.get("DSM_URL", ""),
|
||||||
@@ -589,13 +636,14 @@ def main():
|
|||||||
parser.error("必须提供 --tenant, --user, --password 参数,或设置 C4C_TENANT, C4C_USERNAME, C4C_PASSWORD 环境变量")
|
parser.error("必须提供 --tenant, --user, --password 参数,或设置 C4C_TENANT, C4C_USERNAME, C4C_PASSWORD 环境变量")
|
||||||
|
|
||||||
# 初始化全局配置
|
# 初始化全局配置
|
||||||
global TENANT, USERNAME, PASSWORD, ODATA_C4C, ODATA_CUST, SOAP_URL
|
global TENANT, USERNAME, PASSWORD, ODATA_C4C, ODATA_CUST, SOAP_URL, MAX_WORKERS
|
||||||
TENANT = args.tenant.rstrip("/")
|
TENANT = args.tenant.rstrip("/")
|
||||||
USERNAME = args.user
|
USERNAME = args.user
|
||||||
PASSWORD = args.password
|
PASSWORD = args.password
|
||||||
ODATA_C4C = f"{TENANT}/sap/c4c/odata/v1/c4codata"
|
ODATA_C4C = f"{TENANT}/sap/c4c/odata/v1/c4codata"
|
||||||
ODATA_CUST = f"{TENANT}/sap/c4c/odata/cust/v1/custticketapi"
|
ODATA_CUST = f"{TENANT}/sap/c4c/odata/cust/v1/custticketapi"
|
||||||
SOAP_URL = f"{TENANT}/sap/bc/srt/scs/sap/manageattachmentfolderin"
|
SOAP_URL = f"{TENANT}/sap/bc/srt/scs/sap/manageattachmentfolderin"
|
||||||
|
MAX_WORKERS = args.max_workers
|
||||||
|
|
||||||
# 初始化 DSM 配置
|
# 初始化 DSM 配置
|
||||||
global DSM_URL, DSM_USER, DSM_PASSWORD, DSM_PATH
|
global DSM_URL, DSM_USER, DSM_PASSWORD, DSM_PATH
|
||||||
@@ -604,6 +652,9 @@ def main():
|
|||||||
DSM_PASSWORD = args.dsm_password
|
DSM_PASSWORD = args.dsm_password
|
||||||
DSM_PATH = args.dsm_path
|
DSM_PATH = args.dsm_path
|
||||||
|
|
||||||
|
if not args.json_mode and not args.list_only:
|
||||||
|
print(f"并发下载线程数: {MAX_WORKERS}")
|
||||||
|
|
||||||
result = run(args.ticket, args.output_dir, args.list_only, args.json_mode)
|
result = run(args.ticket, args.output_dir, args.list_only, args.json_mode)
|
||||||
|
|
||||||
# 下载完成后上传到群晖 DSM
|
# 下载完成后上传到群晖 DSM
|
||||||
|
|||||||
Reference in New Issue
Block a user