% arXiv preprint 2406.05629 (primary class cs.CV).
@misc{hamilton2024separating,
  author        = {Hamilton, Mark and Zisserman, Andrew and Hershey, John R. and Freeman, William T.},
  title         = {Separating the {``Chirp''} from the {``Chat''}: Self-supervised Visual Grounding of Sound and Language},
  year          = {2024},
  eprint        = {2406.05629},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}