From go-gpuallocator/gpuallocator/device.go, lines 117 to 131 at 8fc3087:
p2plink, err := links.GetP2PLink(d1, d2)
if err != nil {
	return nil, fmt.Errorf("error getting P2PLink for devices (%v, %v): %v", i, j, err)
}
if p2plink != links.P2PLinkUnknown {
	d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, p2plink})
}

nvlink, err := links.GetNVLink(d1, d2)
if err != nil {
	return nil, fmt.Errorf("error getting NVLink for devices (%v, %v): %v", i, j, err)
}
if nvlink != links.P2PLinkUnknown {
	d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, nvlink})
}
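For context on what that loop produces, here is a minimal, self-contained sketch of how I understand it. The `Device`, `P2PLink`, and link-type names below are simplified stand-ins, not the real gpuallocator definitions; it only illustrates how a single `Links[d2.Index]` slot can end up holding both a PCIe P2P entry and an NVLink entry for the same GPU pair:

```go
package main

import "fmt"

// Simplified stand-ins for the real gpuallocator types; names are illustrative only.
type P2PLinkType int

const (
	P2PLinkUnknown P2PLinkType = iota
	P2PLinkSingleSwitch
	SingleNVLINKLink
)

type Device struct {
	Index int
	Links map[int][]P2PLink
}

type P2PLink struct {
	Device *Device
	Type   P2PLinkType
}

func main() {
	d1 := &Device{Index: 0, Links: map[int][]P2PLink{}}
	d2 := &Device{Index: 1, Links: map[int][]P2PLink{}}

	// Suppose the PCIe query reports a single-switch P2P path...
	p2plink := P2PLinkSingleSwitch
	if p2plink != P2PLinkUnknown {
		d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, p2plink})
	}

	// ...and the NVLink query also reports one NVLink connection.
	nvlink := SingleNVLINKLink
	if nvlink != P2PLinkUnknown {
		d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, nvlink})
	}

	// Both entries coexist in the same slot, so the scoring loop later sees both.
	fmt.Println(len(d1.Links[d2.Index])) // 2
}
```

If I'm reading it right, that is why the scoring loop below iterates over a slice of links per GPU pair rather than a single link.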
What I learned is that even if two GPUs are connected via NVLink, we still need to get the P2PLink and accumulate the scores during allocation, from go-gpuallocator/gpuallocator/besteffort_policy.go, lines 316 to 369 at 8fc3087:
for _, link := range gpu0.Links[gpu1.Index] {
	switch link.Type {
	case nvml.P2PLinkCrossCPU:
		score += 10
	case nvml.P2PLinkSameCPU:
		score += 20
	case nvml.P2PLinkHostBridge:
		score += 30
	case nvml.P2PLinkMultiSwitch:
		score += 40
	case nvml.P2PLinkSingleSwitch:
		score += 50
	case nvml.P2PLinkSameBoard:
		score += 60
	case nvml.SingleNVLINKLink:
		score += 100
	case nvml.TwoNVLINKLinks:
		score += 200
	case nvml.ThreeNVLINKLinks:
		score += 300
	case nvml.FourNVLINKLinks:
		score += 400
	case nvml.FiveNVLINKLinks:
		score += 500
	case nvml.SixNVLINKLinks:
		score += 600
	case nvml.SevenNVLINKLinks:
		score += 700
	case nvml.EightNVLINKLinks:
		score += 800
	case nvml.NineNVLINKLinks:
		score += 900
	case nvml.TenNVLINKLinks:
		score += 1000
	case nvml.ElevenNVLINKLinks:
		score += 1100
	case nvml.TwelveNVLINKLinks:
		score += 1200
	case nvml.ThirteenNVLINKLinks:
		score += 1300
	case nvml.FourteenNVLINKLinks:
		score += 1400
	case nvml.FifteenNVLINKLinks:
		score += 1500
	case nvml.SixteenNVLINKLinks:
		score += 1600
	case nvml.SeventeenNVLINKLinks:
		score += 1700
	case nvml.EighteenNVLINKLinks:
		score += 1800
	}
}

return score
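To make the accumulation concrete, here is a rough worked example under the assumption from my sketch above (both an NVLink entry and a PCIe P2P entry recorded for the pair): the pair would score 100 (SingleNVLINKLink) + 60 (P2PLinkSameBoard) = 160, rather than just 100. A minimal sketch of that arithmetic, again with illustrative stand-in constants rather than the real nvml ones:

```go
package main

import "fmt"

// Illustrative link-type constants mirroring the switch above; not the real nvml values.
type P2PLinkType int

const (
	P2PLinkSameBoard P2PLinkType = iota
	SingleNVLINKLink
)

func main() {
	// Hypothetical pair: connected by one NVLink and also P2P-capable on the same board.
	links := []P2PLinkType{SingleNVLINKLink, P2PLinkSameBoard}

	score := 0
	for _, t := range links {
		switch t {
		case P2PLinkSameBoard:
			score += 60
		case SingleNVLINKLink:
			score += 100
		}
	}
	fmt.Println(score) // 160: the NVLink and PCIe P2P contributions are summed
}
```

The NVLink contribution dominates, but the PCIe P2P term still changes the total, which is what prompted my question.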
What's the reason for this? Aren't these two GPUs communicating through NVLink? I'd appreciate an explanation.