import warnings

import torch
from torch._utils import (
    _flatten_dense_tensors,
    _get_device_index,
    _handle_complex,
    _reorder_tensors_as,
    _take_tensors,
    _unflatten_dense_tensors,
)
from torch.cuda import nccl


def broadcast(tensor, devices=None, *, out=None):
    """Broadcasts a tensor to specified GPU devices.

    Args:
        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to broadcast.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing copies of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a copy of
            :attr:`tensor`.
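
    Example (an illustrative sketch, assuming a host with at least two visible
    CUDA devices; the tensor ``x`` below is only for demonstration)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> x = torch.arange(4.0, device="cuda:0")
        >>> copies = comm.broadcast(x, devices=[0, 1])  # one copy per requested device
        >>> [c.device.index for c in copies]
        [0, 1]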
    """
    tensor = _handle_complex(tensor)
    if not ((devices is None) ^ (out is None)):
        raise RuntimeError(
            f"Exactly one of 'devices' and 'out' must be specified, "
            f"but got devices={devices} and out={out}"
        )
    if devices is not None:
        devices = [_get_device_index(d) for d in devices]
        return torch._C._broadcast(tensor, devices)
    else:
        return torch._C._broadcast_out(tensor, out)


def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcast a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number of synchronizations.

    Args:
        tensors (sequence): tensors to broadcast. Must be on the same device,
          either CPU or GPU.
        devices (Iterable[torch.device, str or int]): an iterable of GPU
          devices, among which to broadcast.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of each tensor in :attr:`tensors`, placed on
        :attr:`devices`.
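
    Example (an illustrative sketch, assuming two visible CUDA devices; the small
    input tensors are coalesced into one buffer per copy before transfer)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> ts = [torch.zeros(3, device="cuda:0"), torch.ones(5, device="cuda:0")]
        >>> per_device = comm.broadcast_coalesced(ts, devices=[0, 1])
        >>> len(per_device)  # one group of copies per destination device
        2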
    """
    devices = [_get_device_index(d) for d in devices]
    tensors = [_handle_complex(t) for t in tensors]
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)


def reduce_add(inputs, destination=None):
    """Sum tensors from multiple GPUs.

    All inputs should have matching shapes, dtype, and layout. The output tensor
    will be of the same shape, dtype, and layout.

    Args:
        inputs (Iterable[Tensor]): an iterable of tensors to add.
        destination (int, optional): a device on which the output will be
            placed (default: current device).

    Returns:
        A tensor containing an elementwise sum of all inputs, placed on the
        :attr:`destination` device.
    """
    destination = _get_device_index(destination, optional=True)
    input_size = inputs[0].size()
    root_index = None  # index of an input that already lives on the destination device
    for i, inp in enumerate(inputs):
        assert inp.device.type != "cpu", "reduce_add expects all inputs to be on GPUs"
        if inp.get_device() == destination:
            root_index = i
        if inp.size() != input_size:
            got = "x".join(str(x) for x in inp.size())
            expected = "x".join(str(x) for x in input_size)
            raise ValueError(f"input {i} has invalid size: got {got}, but expected {expected}")
    if root_index is None:
        raise RuntimeError(
            "reduce_add expects destination to be on the same GPU with one of the tensors"
        )

    if len(inputs) == 1:
        return inputs[0]

    if nccl.is_available(inputs):
        # fast path: let NCCL reduce directly into a fresh tensor on the root device
        result = torch.empty_like(inputs[root_index])
        nccl.reduce(inputs, output=result, root=root_index)
    else:
        # fallback: copy every non-root tensor to the destination device and accumulate
        destination_device = torch.device(inputs[root_index].device.type, destination)
        nonroot = [t for i, t in enumerate(inputs) if i != root_index]
        # the first addition creates a new tensor, so no explicit clone is needed
        result = inputs[root_index] + nonroot[0].to(device=destination_device, non_blocking=True)
        for other in nonroot[1:]:
            result.add_(other.to(device=destination_device, non_blocking=True))
    return result


def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sum tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
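
    Example (an illustrative sketch, assuming two visible CUDA devices that each
    hold one tensor; the per-device groups are summed elementwise)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> a = [torch.ones(2, device="cuda:0")]  # tensors from device 0
        >>> b = [torch.ones(2, device="cuda:1")]  # tensors from device 1
        >>> (summed,) = comm.reduce_add_coalesced([a, b], destination=0)
        >>> summed
        tensor([2., 2.], device='cuda:0')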
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse tensors first, since they may have different sizes on different GPUs
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)  # this will be sparse too
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense tensors, which have consistent sizes across devices
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))


def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None):
    """Scatters tensor across multiple GPUs.

    Args:
        tensor (Tensor): tensor to scatter. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to scatter.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
          each device. It should match :attr:`devices` in length and sum to
          ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided
          into equal chunks.
        dim (int, optional): A dimension along which to chunk :attr:`tensor`.
          Default: ``0``.
        streams (Iterable[torch.cuda.Stream], optional): an iterable of Streams, among
          which to execute the scatter. If not specified, the default stream will
          be utilized.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results. Sizes of these tensors must match that of
          :attr:`tensor`, except for :attr:`dim`, where the total size must
          sum to ``tensor.size(dim)``.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified. When
        :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and
        will be inferred from sizes of :attr:`out`.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing chunks of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a chunk of
            :attr:`tensor`.
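
    Example (an illustrative sketch, assuming two visible CUDA devices; the CPU
    tensor ``x`` is split along ``dim=0`` into one chunk per device)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> x = torch.arange(6.0).reshape(3, 2)
        >>> chunks = comm.scatter(x, devices=[0, 1], dim=0)
        >>> len(chunks)  # one chunk per destination device
        2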
    """
    tensor = _handle_complex(tensor)
    if out is None:
        devices = [_get_device_index(d) for d in devices]
        return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
    else:
        if devices is not None:
            raise RuntimeError(
                f"'devices' must not be specified when 'out' is specified, but got devices={devices}"
            )
        if chunk_sizes is not None:
            raise RuntimeError(
                f"'chunk_sizes' must not be specified when 'out' is specified, "
                f"but got chunk_sizes={chunk_sizes}"
            )
        return tuple(torch._C._scatter_out(tensor, out, dim, streams))


def gather(tensors, dim=0, destination=None, *, out=None):
    """Gathers tensors from multiple GPU devices.

    Args:
        tensors (Iterable[Tensor]): an iterable of tensors to gather.
          Tensor sizes in all dimensions other than :attr:`dim` have to match.
        dim (int, optional): a dimension along which the tensors will be
          concatenated. Default: ``0``.
        destination (torch.device, str, or int, optional): the output device.
          Can be CPU or CUDA. Default: the current CUDA device.
        out (Tensor, optional, keyword-only): the tensor to store gather result.
          Its sizes must match those of :attr:`tensors`, except for :attr:`dim`,
          where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``.
          Can be on CPU or CUDA.

    .. note::
        :attr:`destination` must not be specified when :attr:`out` is specified.

    Returns:
        - If :attr:`destination` is specified,
            a tensor located on :attr:`destination` device, that is a result of
            concatenating :attr:`tensors` along :attr:`dim`.
        - If :attr:`out` is specified,
            the :attr:`out` tensor, now containing results of concatenating
            :attr:`tensors` along :attr:`dim`.
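
    Example (an illustrative sketch, assuming two visible CUDA devices and
    gathering the pieces back to the CPU)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> parts = [torch.zeros(2, 3, device="cuda:0"), torch.ones(1, 3, device="cuda:1")]
        >>> merged = comm.gather(parts, dim=0, destination="cpu")
        >>> merged.shape
        torch.Size([3, 3])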
    """
    tensors = [_handle_complex(t) for t in tensors]
    if out is None:
        if destination == -1:
            warnings.warn(
                "Using -1 to represent CPU tensor is deprecated. Please use a "
                'device object or string instead, e.g., "cpu".',
                FutureWarning,
                stacklevel=2,
            )
        destination = _get_device_index(destination, allow_cpu=True, optional=True)
        return torch._C._gather(tensors, dim, destination)
    else:
        if destination is not None:
            raise RuntimeError(
                f"'destination' must not be specified when 'out' is specified, "
                f"but got destination={destination}"
            )
        return torch._C._gather_out(tensors, out, dim)