import { MDBRow, MDBCol } from 'mdbreact';
import React from 'react';
import Microphone from '../assets/microphone.png';
import Overview from '../assets/overview.jpg';

// Paper abstract rendered in the "Abstract" section of the landing page.
// Stored one sentence per entry for readability; the sentences are joined
// with a single space to form the paragraph that is displayed.
const abstractText = [
  'Automatic lyric transcription (ALT) is a nascent field of study attracting increasing interest from both the speech and music information retrieval communities, given its significant application potential.',
  'However, ALT with audio data alone is a notoriously difficult task due to instrumental accompaniment and musical constraints resulting in degradation of both the phonetic cues and the intelligibility of sung lyrics.',
  'To tackle this challenge, we propose the MultiModal Automatic Lyric Transcription system (MM-ALT), together with a new dataset, N20EM, which consists of audio recordings, videos of lip movements, and inertial measurement unit (IMU) data of an earbud worn by the performing singer.',
  'We first adapt the wav2vec 2.0 framework from automatic speech recognition (ASR) to the ALT task.',
  'We then propose a video-based ALT method and an IMU-based voice activity detection (VAD) method.',
  'In addition, we put forward the Residual Cross Attention (RCA) mechanism to fuse data from the three modalities (i.e., audio, video, and IMU).',
  'Experiments show the effectiveness of our proposed MM-ALT system, especially in terms of noise robustness.',
].join(' ');

class Landing extends React.Component {

  state = {

  }

  componentDidMount = async () => {
  }

  render() {

    const styles = {
      landing: {
        padding: '10vh 15vw 5vh 15vw'
      },
    };

    return (
      <div style={styles.landing}>
        <MDBRow>

          <MDBCol md="6">
            <MDBCol md="12">
              <div className="header">
                {"MM-ALT: a MultiModal Automatic Lyric Transcription System"}
              </div>

              <div style={{ padding: '2vh 0vh' }} className="credits">
                Xiangming Gu*, Longshen Ou*, Danielle Ong, Ye Wang
              </div>
              <div>
                *Both authers contributed equally to the research
              </div>
            </MDBCol>
          </MDBCol>

          <MDBCol style={{ display: 'flex', justifyContent: 'flex-end' }} md="6">
            <img
              src={Microphone}
              alt=''
              style={{height: '35vh'}}
            />
          </MDBCol>

        </MDBRow>

        <MDBRow style={{ marginTop: '2vh', marginLeft: '3px' }}>
          <MDBCol md="12">
            <div className="sectionHeader">
              {"Abstract"}
            </div>
          </MDBCol>

          <MDBCol style={{ marginTop: '1vh' }} md="12">
            <div className="textDescription">
              {abstractText}
            </div>
          </MDBCol>
        </MDBRow>

        <MDBRow style={{ margin: '5vh 0vh 5vh 3px' }}>
          <MDBCol md="12">
            <div className="sectionHeader">
              {"Overview"}
            </div>
          </MDBCol>

          <MDBCol md="12">
            <div style={{ display: 'flex', justifyContent: 'center' }}>
              <img
                src={Overview}
                alt=''
                style={{ width: '90%', padding: '3vh 0vh' }}
              />
            </div>
          </MDBCol>

          <MDBCol style={{ marginTop: '1vh' }} md="12">
            <div className="textDescription">
              {"We present the MultiModal Automatic Lyric Transcription system (MM-ALT), which utilizes three modalities of input: audio, video, and signals from wearable IMU sensors. To facilitate building the system, we curate the N20EM dataset for multimodal lyric transcription. We create a group of models on this dataset that can perform multimodal lyric transcription with varying combinations of modalities, obtaining a minimum word error rate (WER) of 12.71\%. We further reveal an increase in system robustness by introducing additional modalities. With severe perturbations of musical accompaniments (-10 dB SNR), our system can achieve 27.04\% absolute lower WER compared to its audio-only counterparts. "}
            </div>

            <div className="textDescription" style={{ marginTop: '2vh' }}>
              {"We initialize two new tasks: lyric lipreading and IMU-based voice activity detection (VAD). In the lyric lipreading task, we attempt to recognize lyrics in singing utilizing only video information. Our video encoder is the first attempt to retrieve language-related information from singing recordings without the help of audio input. As to the IMU-VAD task, our IMU encoder is the first attempt at building a frame-level VAD system solely from motion data captured by a wearable IMU device. Our experiments elucidate the correlation between the IMU and audio modalities. "}
            </div>

            <div className="textDescription" style={{ marginTop: '2vh' }}>
              {"We propose Residual Cross Attention (RCA), a new feature fusion method to better fuse the multimodal features using self-attention and cross-attention mechanisms. We demonstrate the effectiveness of this new modality fusion method in our ALT system by comparing it with various feature fusion methods."}
            </div>

            {/* <div className="textDescription" style={{ marginTop: '2vh' }}>
              {"We initialize a new task referred to as lyric lipreading. In this task, we attempt to recognize lyrics in singing utilizing only video information. This is the first attempt to retrieve language-related information from singing recordings without the help of audio input."}
            </div> */}
          </MDBCol>
        </MDBRow>
      </div>
    );
  }
}

// Expose the Landing component as this module's default export.
export default Landing;
