ronwpubs.bib

@conference{ellis06pvocvq,
  author = {D. P. W. Ellis and R. J. Weiss},
  title = {{Model-Based Monaural Source Separation Using a Vector-Quantized Phase-Vocoder Representation}},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages = {V-957--V-960},
  month = may,
  year = {2006},
  address = {Toulouse, France},
  doi = {10.1109/ICASSP.2006.1661436},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icassp2006-pvocvq.pdf}
}
@conference{weiss06rvmsep,
  author = {R. J. Weiss and D. P. W. Ellis},
  title = {{Estimating Single-Channel Source Separation Masks: Relevance Vector Machine Classifiers vs. Pitch-Based Masking}},
  booktitle = {Proc. ISCA Tutorial and Research Workshop on Statistical Perceptual Audition (SAPA)},
  pages = {31--36},
  month = sep,
  year = {2006},
  address = {Pittsburgh, USA},
  http = {http://www.isca-speech.org/archive/sapa_2006/sap6_031.html},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/sapa2006-rvmpvsourcesep.pdf},
  slides = {http://www.ee.columbia.edu/~ronw/pubs/sapa2006-rvmpvsourcesep-slides.pdf}
}
@conference{weiss07adapted_models,
  author = {R. J. Weiss and D. P. W. Ellis},
  title = {{Monaural Speech Separation Using Source-Adapted Models}},
  booktitle = {Proc. {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics ({WASPAA})},
  month = oct,
  year = 2007,
  pages = {114--117},
  address = {New Paltz, USA},
  doi = {10.1109/ASPAA.2007.4393039},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/waspaa2007-adapted_models.pdf},
  slides = {http://www.ee.columbia.edu/~ronw/pubs/waspaa2007-adapted_models-slides.pdf},
  web = {http://www.ee.columbia.edu/~ronw/SSC.html}
}
@conference{weiss08messlsp,
  author = {R. J. Weiss and M. I. Mandel and D. P. W. Ellis},
  title = {{Source Separation Based on Binaural Cues and Source Model Constraints}},
  booktitle = {Proc. Interspeech},
  pages = {419--422},
  month = sep,
  year = {2008},
  address = {Brisbane, Australia},
  http = {http://www.isca-speech.org/archive/interspeech_2008/i08_0419.html},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icslp2008-messl_sp.pdf},
  poster = {http://www.ee.columbia.edu/~ronw/pubs/icslp2008-messl_sp-poster.pdf}
}
@conference{weiss08dysana,
  author = {R. J. Weiss and T. Kristjansson},
  title = {{{DySANA}: Dynamic Speech and Noise Adaptation for Voice Activity Detection}},
  booktitle = {Proc. Interspeech},
  pages = {127--130},
  month = sep,
  year = {2008},
  address = {Brisbane, Australia},
  http = {http://www.isca-speech.org/archive/interspeech_2008/i08_0127.html},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icslp2008-dysana.pdf},
  poster = {http://www.ee.columbia.edu/~ronw/pubs/icslp2008-dysana-poster.pdf}
}
@conference{weiss09vem,
  author = {R. J. Weiss and D. P. W. Ellis},
  title = {{A Variational EM Algorithm for Learning Eigenvoice Parameters in Mixed Signals}},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages = {113--116},
  month = apr,
  year = 2009,
  address = {Taipei, Taiwan},
  doi = {10.1109/ICASSP.2009.4959533},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icassp2009-ev_vem.pdf},
  poster = {http://www.ee.columbia.edu/~ronw/pubs/icassp2009-ev_vem-poster.pdf}
}
@phdthesis{weiss09thesis,
  title = {Underdetermined Source Separation Using Speaker Subspace Models},
  author = {R. J. Weiss},
  year = {2009},
  school = {Department of {E}lectrical {E}ngineering, Columbia University},
  publisher = {Columbia University},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/ronw-thesis.pdf},
  slides = {http://www.ee.columbia.edu/~ronw/pubs/ronw-thesis-slides.pdf},
  abstract = {Sounds rarely occur in isolation. Despite this, significant effort has been dedicated to the
design of computer audition systems, such as speech recognizers, that can only analyze
isolated sound sources. In fact, there are a variety of applications in both human and
computer audition for which it is desirable to understand more complex auditory scenes.
In order to extend such systems to operate on mixtures of many sources, the ability to
recover the source signals from the mixture is required. This process is known as source
separation.

In this thesis we focus on the problem of underdetermined source separation where the
number of sources is greater than the number of channels in the observed mixture. In
the worst case, when the observations are derived from a single microphone, it is often
necessary for a separation algorithm to utilize prior information about the sources present
in the mixture to constrain possible source reconstructions. A common approach for
separating such signals is based on the use of source-specific statistical models. In most
cases this approach requires that significant training data be available to train models for
the sources known in advance to be present in the mixed signal. We propose a speaker
subspace model for source adaptation that alleviates this requirement.

We report a series of experiments on monaural mixtures of speech signals and demonstrate
that the use of the proposed speaker subspace model can separate sources far better than
the use of unadapted, source-independent models. The proposed method also outperforms
other state-of-the-art approaches when training data is not available for the exact speakers
present in the mixed signal.

Finally, we describe a system for binaural speech separation that combines constraints
based on interaural localization cues with constraints derived from source models. Although a simpler system based only on localization cues is sometimes able to adequately
isolate sources, the incorporation of a source-independent model is shown to significantly
improve performance. Further improvements are obtained by using the proposed speaker
subspace model to adapt to the sources present in the signal.}
}
@article{weiss10ssc,
  title = {{Speech Separation Using Speaker-Adapted Eigenvoice Speech Models}},
  author = {R. J. Weiss and D. P. W. Ellis},
  journal = {Computer Speech and Language},
  month = jan,
  year = {2010},
  volume = {24},
  number = {1},
  pages = {16--29},
  note = {Special issue on Speech Separation and Recognition Challenge},
  issn = {0885-2308},
  doi = {10.1016/j.csl.2008.03.003},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/csl2008-eigenvoice_speech_sep.pdf},
  abstract = {We present a system for model-based source separation for use on single channel speech mixtures where the precise source characteristics are not known a priori.  The sources are modeled using hidden Markov models (HMM) and separated using factorial HMM methods.  Without prior speaker models for the sources in the mixture it is difficult to exactly resolve the individual sources because there is no way to determine which state corresponds to which source at any point in time.  This is solved to a small extent by the temporal constraints provided by the Markov models, but permutations between sources remain a significant problem.  We overcome this by adapting the models to match the sources in the mixture.  We do this by representing the space of speaker variation with a parametric signal model based on the eigenvoice technique for rapid speaker adaptation.  We present an algorithm to infer the characteristics of the sources present in a mixture, allowing for significantly improved separation performance over that obtained using unadapted source models.  The algorithm is evaluated on the task defined in the 2006 Speech Separation Challenge and compared with separation using source-dependent models.  Although performance is not as good as with speaker-dependent models, we show that the system based on model adaptation is able to generalize better to held out speakers.}
}
@article{mandel10messl,
  title = {{Model-Based Expectation-Maximization Source Separation and Localization}},
  author = {M. I. Mandel and R. J. Weiss and D. P. W. Ellis},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2010},
  month = feb,
  volume = {18},
  number = {2},
  pages = {382--394},
  doi = {10.1109/TASL.2009.2029711},
  issn = {1558-7916},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/taslp09-messl.pdf},
  abstract = {This paper describes a system, referred to as model-based expectation-maximization source separation and localization (MESSL), for separating and localizing multiple sound sources from an underdetermined reverberant two-channel recording. By clustering individual spectrogram points based on their interaural phase and level differences, MESSL generates masks that can be used to isolate individual sound sources. We first describe a probabilistic model of interaural parameters that can be evaluated at individual spectrogram points. By creating a mixture of these models over sources and delays, the multi-source localization problem is reduced to a collection of single source problems. We derive an expectation-maximization algorithm for computing the maximum-likelihood parameters of this mixture model, and show that these parameters correspond well with interaural parameters measured in isolation. As a byproduct of fitting this mixture model, the algorithm creates probabilistic spectrogram masks that can be used for source separation. In simulated anechoic and reverberant environments, separations using MESSL produced on average a signal-to-distortion ratio 1.6 dB greater and perceptual evaluation of speech quality (PESQ) results 0.27 mean opinion score units greater than four comparable algorithms.},
  web = {http://github.com/mim/messl}
}
@inproceedings{weiss10nmfseg,
  author = {R. J. Weiss and J. P. Bello},
  title = {{Identifying Repeated Patterns in Music Using Sparse Convolutive Non-Negative Matrix Factorization}},
  booktitle = {Proc. International Society for Music Information Retrieval Conference ({ISMIR})},
  pages = {123--128},
  month = aug,
  year = 2010,
  address = {Utrecht, Netherlands},
  note = {Best Paper Award},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/ismir2010-nmfseg.pdf},
  web = {http://ronw.github.com/siplca-segmentation},
  slides = {http://www.ee.columbia.edu/~ronw/pubs/ismir2010-nmfseg-slides.pdf}
}
@inproceedings{bertin10patterns,
  author = {T. Bertin-Mahieux and R. J. Weiss and D. P. W. Ellis},
  title = {{Clustering Beat-Chroma Patterns in a Large Music Database}},
  booktitle = {Proc. International Society for Music Information Retrieval Conference ({ISMIR})},
  pages = {111--116},
  month = aug,
  year = 2010,
  address = {Utrecht, Netherlands},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/ismir2010-beatchromapatterns.pdf},
  web = {http://www.columbia.edu/~tb2332/ProjClustering/ClusteringChromas.html}
}
@inproceedings{cho10chordreco,
  author = {T. Cho and R. J. Weiss and J. P. Bello},
  title = {{Exploring Common Variations in State of the Art Chord Recognition Systems}},
  booktitle = {Proc. Sound and Music Computing Conference ({SMC})},
  pages = {1--8},
  month = jul,
  year = 2010,
  address = {Barcelona, Spain},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/smc2010-chordreco.pdf}
}
@article{weiss11siplca,
  author = {R. J. Weiss and J. P. Bello},
  title = {{Unsupervised Discovery of Temporal Structure in Music}},
  journal = {{IEEE} Journal of Selected Topics in Signal Processing},
  year = 2011,
  month = oct,
  volume = 5,
  number = 6,
  pages = {1240--1251},
  issn = {1932-4553},
  doi = {10.1109/JSTSP.2011.2145356},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/jstsp2011-siplca.pdf}
}
@article{weiss11messlev,
  title = {{Combining Localization Cues and Source Model Constraints for Binaural Source Separation}},
  author = {R. J. Weiss and M. I. Mandel and D. P. W. Ellis},
  journal = {Speech Communication},
  note = {Special issue on Perceptual and Statistical Audition},
  year = 2011,
  month = may,
  volume = {53},
  number = {5},
  pages = {606--621},
  issn = {0167-6393},
  doi = {10.1016/j.specom.2011.01.003},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/specom2011-messlev.pdf}
}
@inproceedings{bertin11evaluating,
  author = {T. Bertin-Mahieux and G. Grindlay and R. J. Weiss and D. P. W. Ellis},
  title = {{Evaluating Music Sequence Models Through Missing Data}},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = may,
  pages = {177--180},
  year = 2011,
  address = {Prague, Czech Republic},
  issn = {1520-6149},
  doi = {10.1109/ICASSP.2011.5946369},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icassp2011-imputation.pdf}
}
@article{pedregosa11scikit-learn,
  title = {scikit-learn: Machine Learning in Python},
  author = {F. Pedregosa and G. Varoquaux and A. Gramfort and V. Michel
                  and B. Thirion and O. Grisel and M. Blondel
                  and P. Prettenhofer and R. Weiss and V. Dubourg
                  and J. Vanderplas and A. Passos and D. Cournapeau
                  and M. Brucher and M. Perrot and \'{E}. Duchesnay},
  journal = {Journal of Machine Learning Research},
  volume = 12,
  month = oct,
  pages = {2825--2830},
  year = {2011},
  http = {http://jmlr.org/papers/v12/pedregosa11a.html},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/jmlr2011-scikit-learn.pdf}
}
@inproceedings{weston12lcr,
  title = {Latent Collaborative Retrieval},
  author = {J. Weston and C. Wang and R. Weiss and A. Berenzweig},
  booktitle = {Proc. International Conference on Machine Learning ({ICML})},
  month = jun,
  year = {2012},
  address = {Edinburgh, Scotland},
  http = {http://icml.cc/discuss/2012/12.html},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icml2012-lcr.pdf}
}
@inproceedings{weston13awe,
  title = {Affinity Weighted Embedding},
  author = {J. Weston and R. Weiss and H. Yee},
  booktitle = {International Conference on Learning Representations ({ICLR})},
  month = may,
  year = {2013},
  address = {Scottsdale, USA},
  http = {http://openreview.net/document/8bc82d3f-df5e-4602-bc3d-2f6fa0196f5f},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/iclr2013-awe.pdf}
}
@inproceedings{weston13usermax,
  title = {Nonlinear Latent Factorization by Embedding Multiple User Interests},
  author = {J. Weston and R. J. Weiss and H. Yee},
  booktitle = {Proc. ACM Conference on Recommender Systems ({RecSys})},
  pages = {65--68},
  month = oct,
  year = {2013},
  address = {Hong Kong},
  doi = {10.1145/2507157.2507209},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/recsys2013-usermax.pdf}
}
@inproceedings{weston13kaos,
  title = {Learning to Rank Recommendations with the k-order Statistic Loss},
  author = {J. Weston and H. Yee and R. J. Weiss},
  booktitle = {Proc. ACM Conference on Recommender Systems ({RecSys})},
  pages = {245--248},
  month = oct,
  year = {2013},
  address = {Hong Kong},
  doi = {10.1145/2507157.2507210},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/recsys2013-kaos.pdf}
}
@inproceedings{weston14awe,
  title = {Affinity Weighted Embedding},
  author = {J. Weston and R. Weiss and H. Yee},
  booktitle = {Proc. International Conference on Machine Learning ({ICML})},
  pages = {1215--1223},
  month = jun,
  year = {2014},
  address = {Beijing, China},
  http = {http://jmlr.org/proceedings/papers/v32/weston14.html},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icml2014-awe.pdf}
}
@inproceedings{hoshen15waveformam,
  author = {Y. Hoshen and R. J. Weiss and K. W. Wilson},
  title = {{Speech Acoustic Modeling from Raw Multichannel Waveforms}},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = apr,
  year = 2015,
  address = {Brisbane, Australia},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icassp2015-waveformam.pdf}
}
@inproceedings{sainath15waveform_cldnn,
  title = {Learning the Speech Front-End with Raw Waveform {CLDNN}s},
  author = {T. N. Sainath and R. J. Weiss and A. Senior and K. W. Wilson and O. Vinyals},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = 2015,
  address = {Dresden, Germany},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/interspeech2015-waveform_cldnn.pdf}
}
@inproceedings{sainath15multichannel,
  title = {Speaker Location and Microphone Spacing Invariant Acoustic Modeling from Raw Multichannel Waveforms},
  author = {T. N. Sainath and R. J. Weiss and K. W. Wilson and A. Narayanan and M. Bacchiani and A. Senior},
  booktitle = {Proc. {IEEE} Automatic Speech Recognition and Understanding Workshop ({ASRU})},
  month = dec,
  year = 2015,
  address = {Scottsdale, USA},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/asru2015-multichannel_cldnn.pdf}
}
@inproceedings{sainath16factored,
  title = {Factored Spatial and Spectral Multichannel Raw Waveform {CLDNN}s},
  author = {T. N. Sainath and R. J. Weiss and K. W. Wilson and A. Narayanan and M. Bacchiani},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = mar,
  year = 2016,
  address = {Shanghai, China},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icassp2016-factored_cldnn.pdf}
}
@inproceedings{li16adaptive,
  title = {Neural Network Adaptive Beamforming for Robust Multichannel Speech Recognition},
  author = {B. Li and T. N. Sainath and R. J. Weiss and K. W. Wilson and M. Bacchiani},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = 2016,
  address = {San Francisco, USA},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/interspeech2016-waveform_cldnn_adaptive.pdf}
}
@inproceedings{sainath16speedups,
  title = {Reducing the Computational Complexity of Multimicrophone Acoustic Models with Integrated Feature Extraction},
  author = {T. N. Sainath and A. Narayanan and R. J. Weiss and E. Variani and K. W. Wilson and M. Bacchiani and I. Shafran},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = 2016,
  address = {San Francisco, USA},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/interspeech2016-waveform_cldnn_speedups.pdf}
}
@inproceedings{hershey17audiocnn,
  title = {{CNN} Architectures for Large-Scale Audio Classification},
  author = {S. Hershey and S. Chaudhuri and D. P. W. Ellis and J. F. Gemmeke and A. Jansen and R. C. Moore and M. Plakal and D. Platt and R. A. Saurous and B. Seybold and M. Slaney and R. J. Weiss and K. Wilson},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = mar,
  year = 2017,
  address = {New Orleans, USA},
  web = {https://arxiv.org/abs/1609.09430},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/icassp2017-audiocnn.pdf}
}
@article{sainath17multichannel,
  title = {Multichannel Signal Processing with Deep Neural Networks for Automatic Speech Recognition},
  author = {T. N. Sainath and R. J. Weiss and K. W. Wilson and B. Li and A. Narayanan and E. Variani and M. Bacchiani and I. Shafran and A. Senior and K. W. Chin and A. Misra and C. Kim},
  journal = {{IEEE}/{ACM} Transactions on Audio, Speech, and Language Processing},
  month = feb,
  year = {2017},
  publisher = {IEEE},
  doi = {10.1109/TASLP.2017.2672401},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/taslp2017-multichannel.pdf}
}
@incollection{sainath17raw,
  title = {Raw Multichannel Processing Using Deep Neural Networks},
  author = {T. N. Sainath and R. J. Weiss and K. W. Wilson and B. Li and A. Narayanan and E. Variani and M. Bacchiani and I. Shafran and A. Senior and K. W. Chin and A. Misra and C. Kim},
  editor = {Shinji Watanabe and Marc Delcroix and Florian Metze and John R. Hershey},
  booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
  publisher = {Springer},
  year = {2017},
  note = {to appear.},
  pdf = {http://www.ee.columbia.edu/~ronw/pubs/jsalt2017-raw.pdf}
}