-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feat : 학교 홈페이지 리뉴얼로인한 교직원 스크랩 및 스크랩간 직위 추가 #223
Changes from 28 commits
cdab7ff
48a182e
39d0b7c
7d1e6db
8db54c6
9c64b73
0fd0dc8
28a7557
961b896
fd4025e
cca4bd4
c8c22e1
a20e5de
5c915ee
f6bc03e
49ca89b
3ae1d1a
38654a6
8e921da
2a84b06
4a5f28a
708f4bf
9ac2010
2b36510
30c6675
8ce7b42
3727f63
ffd86bd
49c684b
eb9cded
5d3a715
666db2c
b607552
980fe0f
143a5e5
3049c30
47b08f9
05bdb5d
c8f848e
48b48c1
ec2fafb
5cf1377
518ac33
bcadaba
4216676
90f5958
3954e23
5afe41f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
package com.kustacks.kuring.common.utils.converter; | ||
|
||
import java.util.regex.Pattern; | ||
|
||
public class EmailConverter { | ||
private static final Pattern AT_PATTERN = Pattern.compile("\\s+at\\s+"); | ||
private static final Pattern DOT_PATTERN = Pattern.compile("\\s+dot\\s+"); | ||
private static final Pattern EMAIL_PATTERN = Pattern.compile("^[a-zA-Z0-9_!#$%&'\\*+/=?{|}~^.-]+@[a-zA-Z0-9.-]+$"); | ||
|
||
private static final String KONKUK_DOMAIN = "@konkuk.ac.kr"; | ||
|
||
public static String convertValidEmail(String email) { | ||
if (email == null || email.isBlank()) { | ||
return ""; // 빈 입력 처리 | ||
} | ||
|
||
//여러 이메일인 경우 있으니 분리. | ||
String[] emailGroups = email.split("[/,]"); | ||
//정상 구조가 아닌 경우 구조 정상화 | ||
for (int i = 0; i < emailGroups.length; i++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 코드를 조금 다시 재구성 해봤는데 확인 부탁해요~ public class EmailConverter {
private static final Pattern AT_PATTERN = Pattern.compile("\\s+at\\s+");
private static final Pattern DOT_PATTERN = Pattern.compile("\\s+dot\\s+");
private static final Pattern EMAIL_PATTERN = Pattern.compile("^[a-zA-Z0-9_!#$%&'\\*+/=?{|}~^.-]+@[a-zA-Z0-9.-]+$");
private static final String KONKUK_DOMAIN = "@konkuk.ac.kr";
private static final String EMPTY_EMAIL = "";
/**
 * Converts raw e-mail text, which may hold several addresses, into one address.
 *
 * <p>The text is split into candidates, every candidate is normalized, and one
 * of the results is picked by {@code selectPreferredEmail}.
 *
 * @param email raw e-mail text; may be {@code null} or blank
 * @return the selected address, or {@code EMPTY_EMAIL} when the input is null or blank
 */
public static String convertValidEmail(String email) {
    if (!isNullOrEmpty(email)) {
        return selectPreferredEmail(normalizeEmails(splitEmails(email)));
    }
    return EMPTY_EMAIL;
}
/** Returns {@code true} when {@code str} is {@code null} or consists only of whitespace. */
private static boolean isNullOrEmpty(String str) {
    if (str == null) {
        return true;
    }
    return str.isBlank();
}
/**
 * Splits the raw text into candidate addresses at every {@code /} or {@code ,}.
 *
 * @param email non-null raw e-mail text
 * @return the pieces between separators, exactly as {@link String#split(String)} yields them
 */
private static String[] splitEmails(String email) {
    String[] candidates = email.split("[/,]");
    return candidates;
}
/**
 * Normalizes every candidate address, keeping the original order.
 *
 * @param emailGroups candidate addresses produced by splitting the raw text
 * @return a new array of the same length holding each normalized candidate
 */
private static String[] normalizeEmails(String[] emailGroups) {
    String[] normalized = new String[emailGroups.length];
    for (int i = 0; i < emailGroups.length; i++) {
        normalized[i] = normalizeEmail(emailGroups[i]);
    }
    return normalized;
}
/**
 * Normalizes a single candidate address.
 *
 * <p>Surrounding whitespace is removed first, because candidates come from
 * splitting text such as {@code "a@x.com / b@y.com"} and would otherwise fail
 * the e-mail pattern only because of the padding.
 *
 * @param email one candidate address; may be {@code null} or blank
 * @return the candidate if it already matches {@code EMAIL_PATTERN}; the converted
 *         address if it was written with " at " / " dot " and the conversion yields
 *         a valid address; otherwise {@code EMPTY_EMAIL}
 */
private static String normalizeEmail(String email) {
    if (isNullOrEmpty(email)) {
        return EMPTY_EMAIL;
    }

    String candidate = email.strip();
    if (EMAIL_PATTERN.matcher(candidate).matches()) {
        return candidate;
    }

    if (containsSubstitutePatterns(candidate)) {
        String converted = replaceSubstitutePatterns(candidate);
        // Only accept the conversion when it really produced a well-formed address;
        // anything else is treated like every other unrecognized format.
        if (EMAIL_PATTERN.matcher(converted).matches()) {
            return converted;
        }
    }
    return EMPTY_EMAIL;
}
/**
 * Tells whether the text spells out both separators in words, i.e. it contains a
 * match of {@code DOT_PATTERN} and a match of {@code AT_PATTERN}.
 */
private static boolean containsSubstitutePatterns(String email) {
    boolean hasDotWord = DOT_PATTERN.matcher(email).find();
    return hasDotWord && AT_PATTERN.matcher(email).find();
}
/**
 * Replaces every match of {@code DOT_PATTERN} with "." and then every match of
 * {@code AT_PATTERN} with "@".
 *
 * <p>The precompiled patterns are used directly; {@code String.replaceAll} would
 * compile the same regular expressions again on every call.
 *
 * @param email text that spells the separators as words
 * @return the text with both word forms replaced by their symbols
 */
private static String replaceSubstitutePatterns(String email) {
    String withDots = DOT_PATTERN.matcher(email).replaceAll(".");
    return AT_PATTERN.matcher(withDots).replaceAll("@");
}
/**
 * Picks one address from the normalized candidates.
 *
 * <p>The first non-blank candidate ending with {@code KONKUK_DOMAIN} wins. If there
 * is none, the first non-blank candidate is returned, so a blank first entry no
 * longer hides a valid address that appears later in the array.
 *
 * @param emails normalized candidates; may be empty and may contain blank entries
 * @return the preferred address, or {@code EMPTY_EMAIL} when no candidate is usable
 */
private static String selectPreferredEmail(String[] emails) {
    String fallback = null;
    for (String email : emails) {
        if (email == null || email.isBlank()) {
            continue;
        }
        if (email.endsWith(KONKUK_DOMAIN)) {
            return email;
        }
        if (fallback == null) {
            fallback = email;
        }
    }
    return fallback != null ? fallback : EMPTY_EMAIL;
}
} |
||
emailGroups[i] = normalizeEmail(emailGroups[i]); | ||
} | ||
|
||
//여러 이메일 중 konkuk을 우선 선택, 없으면 첫번째 내용 | ||
return selectEmail(emailGroups); | ||
} | ||
|
||
private static String normalizeEmail(String email) { | ||
if (email == null || email.isBlank()) { | ||
return ""; | ||
} | ||
|
||
// 정상 이메일인지 확인 | ||
if (EMAIL_PATTERN.matcher(email).matches()) { | ||
return email; | ||
} | ||
|
||
// "@", "." 대신 "at", "dot"으로 되어있는 경우 변환 | ||
if (DOT_PATTERN.matcher(email).find() && AT_PATTERN.matcher(email).find()) { | ||
return email.replaceAll(DOT_PATTERN.pattern(), ".") | ||
.replaceAll(AT_PATTERN.pattern(), "@"); | ||
} | ||
|
||
// 기타 이상한 형식은 빈공백으로 저장 | ||
return ""; | ||
} | ||
|
||
// Konkuk 도메인 우선 선택 | ||
private static String selectEmail(String[] emails) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 인자로 받는 emails가 0개인 일은 없나요? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 현재는 빈 공백이라도 입력되도록 구현되어 있어서 괜찮다고 넘긴 부분인데, 이 메서드의 입장에서 봤을 때 불확실한 믿음을 가져야 하는 거 같아서 검증할 수 있도록 추가하겠습니다! |
||
for (String email : emails) { | ||
if (email.endsWith(KONKUK_DOMAIN)) { | ||
return email; | ||
} | ||
} | ||
return emails[0]; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package com.kustacks.kuring.common.utils.converter; | ||
|
||
import java.util.regex.Pattern; | ||
|
||
/**
 * Normalizes scraped phone numbers into the {@code 02-XXX(X)-XXXX} form.
 *
 * <p>This is a stateless utility class; it only exposes static methods.
 */
public class PhoneNumberConverter {

    private static final Pattern LAST_FOUR_NUMBER_PATTERN = Pattern.compile("\\d{4}");
    private static final Pattern FULL_NUMBER_PATTERN = Pattern.compile("02-\\d{3,4}-\\d{4}");
    private static final Pattern FULL_NUMBER_WITH_PARENTHESES_PATTERN = Pattern.compile("02[)]\\d{3,4}-\\d{4}");

    /** Prefix prepended when only the last four digits are given. */
    private static final String EXTENSION_PREFIX = "02-450-";
    private static final String EMPTY_NUMBER = "";

    private PhoneNumberConverter() {
        // Utility class: not meant to be instantiated.
    }

    /**
     * Converts a scraped number into a full phone number.
     *
     * <p>Leading and trailing whitespace is ignored. Three shapes are recognized:
     * <ul>
     *   <li>four digits, e.g. {@code "3456"} becomes {@code "02-450-3456"};</li>
     *   <li>a full number, e.g. {@code "02-450-3456"}, returned as is;</li>
     *   <li>a full number with a parenthesis, e.g. {@code "02)450-3456"}, where
     *       {@code ")"} is replaced by {@code "-"}.</li>
     * </ul>
     *
     * @param number scraped number text; may be {@code null} or blank
     * @return the normalized number, or an empty string when the input is null,
     *         blank, or in any other format (for example {@code "218)"})
     */
    public static String convertFullExtensionNumber(String number) {
        if (number == null || number.isBlank()) {
            return EMPTY_NUMBER;
        }

        // The patterns below are matched against the whole string, so padding
        // around an otherwise valid number must be removed first.
        String candidate = number.strip();

        if (LAST_FOUR_NUMBER_PATTERN.matcher(candidate).matches()) {
            return EXTENSION_PREFIX + candidate;
        }
        if (FULL_NUMBER_PATTERN.matcher(candidate).matches()) {
            return candidate;
        }
        if (FULL_NUMBER_WITH_PARENTHESES_PATTERN.matcher(candidate).matches()) {
            return candidate.replace(")", "-");
        }

        // Any other unrecognized format is stored as an empty string.
        return EMPTY_NUMBER;
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,6 @@ | ||
package com.kustacks.kuring.worker.parser.staff; | ||
|
||
import com.kustacks.kuring.worker.scrap.deptinfo.DeptInfo; | ||
import com.kustacks.kuring.worker.scrap.deptinfo.art_design.CommunicationDesignDept; | ||
import com.kustacks.kuring.worker.scrap.deptinfo.art_design.LivingDesignDept; | ||
import com.kustacks.kuring.worker.scrap.deptinfo.real_estate.RealEstateDept; | ||
import lombok.NoArgsConstructor; | ||
import lombok.extern.slf4j.Slf4j; | ||
|
@@ -18,33 +16,22 @@ public class EachDeptStaffHtmlParser extends StaffHtmlParserTemplate { | |
|
||
@Override | ||
public boolean support(DeptInfo deptInfo) { | ||
return !(deptInfo instanceof RealEstateDept) && | ||
!(deptInfo instanceof LivingDesignDept) && | ||
!(deptInfo instanceof CommunicationDesignDept); | ||
return !(deptInfo instanceof RealEstateDept); | ||
} | ||
|
||
protected Elements selectStaffInfoRows(Document document) { | ||
Element table = document.select(".photo_intro").get(0); | ||
return table.getElementsByTag("dl"); | ||
return document.select(".row"); | ||
} | ||
|
||
protected String[] extractStaffInfoFromRow(Element row) { | ||
Elements infos = row.getElementsByTag("dd"); | ||
|
||
// 교수명, 직위, 세부전공, 연구실, 연락처, 이메일 순으로 파싱 | ||
// 연구실, 연락처 정보는 없는 경우가 종종 있으므로, childNode접근 전 인덱스 체크하는 로직을 넣었음 | ||
String name = infos.get(0).getElementsByTag("span").get(1).text(); | ||
|
||
String jobPosition = String.valueOf(infos.get(1).childNodeSize() < 2 ? "" : infos.get(1).childNode(1)); | ||
if (jobPosition.contains("명예") || jobPosition.contains("대우") || jobPosition.contains("휴직") || !jobPosition.contains("교수")) { | ||
log.info("스크래핑 스킵 -> {} 교수", name); | ||
return new String[]{}; | ||
} | ||
|
||
String major = infos.get(2).childNodeSize() < 2 ? "" : String.valueOf(infos.get(2).childNode(1)); | ||
String lab = infos.get(3).childNodeSize() < 2 ? "" : String.valueOf(infos.get(3).childNode(1)); | ||
String phone = infos.get(4).childNodeSize() < 2 ? "" : String.valueOf(infos.get(4).childNode(1)); | ||
String email = infos.get(5).getElementsByTag("a").get(0).text(); | ||
return new String[]{name, major, lab, phone, email}; | ||
String name = row.select(".info .title .name").text(); | ||
|
||
Elements detailElement = row.select(".detail"); | ||
String jobPosition = detailElement.select(".ico1 dd").text().trim(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 여기의 모든 html요소들이 항상 존재할까요? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 말씀해주신 부분 알아보니 null값은 나오지 않도록 Jsoup에서 지원하는 것 같아요! 예를들어
두 경우 모두 직접 테스트 해본 결과 null값이 아닌 ""과 같은 빈 공백이 배열에 저장됩니다. 실제, 데이터가 없는 학과(ex. 수의예과의 경우 1번의 경우에 해당되는걸 확인했습니다. 혹시나 싶어 Jsoup 라이브러리의 select메서드를 훑어봤을 때 찾는 요소가 없다면 빈 Elements 객체를 반환하는 걸로 보입니다. public static Elements select(String query, Iterable<Element> roots) {
Validate.notEmpty(query);
Validate.notNull(roots);
Evaluator evaluator = QueryParser.parse(query);
Elements elements = new Elements();
IdentityHashMap<Element, Boolean> seenElements = new IdentityHashMap();
Iterator var5 = roots.iterator();
while(var5.hasNext()) {
Element root = (Element)var5.next();
Elements found = select(evaluator, root);
Iterator var8 = found.iterator();
while(var8.hasNext()) {
Element el = (Element)var8.next();
if (seenElements.put(el, Boolean.TRUE) == null) {
elements.add(el);
}
}
}
return elements;
} 마찬가지 text() 메서드 또한 빈 StringBuilder 객체를 생성하고 사용하기에 값이 없다면 그대로 빈 공백이 출력되도록 하는거 같습니다. public String text() {
StringBuilder sb = StringUtil.borrowBuilder();
Element element;
for(Iterator var2 = this.iterator(); var2.hasNext(); sb.append(element.text())) {
element = (Element)var2.next();
if (sb.length() != 0) {
sb.append(" ");
}
}
return StringUtil.releaseBuilder(sb);
} 솔직하게 말하자면 잠깐 고민했던 부분인데 일단 돌아가길래 뒀던거 같습니다 하하...😂 |
||
String major = detailElement.select(".ico2 dd").text().trim(); | ||
String lab = detailElement.select(".ico3 dd").text().trim(); | ||
String extensionNumber = detailElement.select(".ico4 dd").text().trim(); | ||
String email = detailElement.select(".ico5 dd").text().trim(); | ||
return new String[]{name, jobPosition, major, lab, extensionNumber, email}; | ||
} | ||
} |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,30 +7,30 @@ | |
import org.jsoup.select.Elements; | ||
import org.springframework.stereotype.Component; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
@Component | ||
public class RealEstateStaffHtmlParser extends StaffHtmlParserTemplate { | ||
|
||
@Override | ||
public boolean support(DeptInfo deptInfo) { | ||
return deptInfo instanceof RealEstateDept; | ||
} | ||
|
||
protected Elements selectStaffInfoRows(Document document) { | ||
Element table = document.select(".sub0201_list").get(0).getElementsByTag("ul").get(0); | ||
return table.getElementsByTag("li"); | ||
return document.select(".row"); | ||
} | ||
|
||
protected String[] extractStaffInfoFromRow(Element row) { | ||
Element content = row.select(".con").get(0); | ||
|
||
String name = content.select("dl > dt > a > strong").get(0).text(); | ||
String major = String.valueOf(content.select("dl > dd").get(0).childNode(4)).replaceFirst("\\s", "").trim(); | ||
|
||
Element textMore = content.select(".text_more").get(0); | ||
|
||
String lab = String.valueOf(textMore.childNode(4)).split(":")[1].replaceFirst("\\s", "").trim(); | ||
String phone = String.valueOf(textMore.childNode(6)).split(":")[1].replaceFirst("\\s", "").trim(); | ||
String email = textMore.getElementsByTag("a").get(0).text(); | ||
return new String[]{name, major, lab, phone, email}; | ||
String name = row.select(".info .title .name").text(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 이쪽 함수도 NPE가능성이 있어서 try |
||
|
||
Elements detalTagElement = row.select(".detail"); | ||
String jobPosition = detalTagElement.select("dt:contains(직위) + dd").text(); | ||
String major = detalTagElement.select("dt:contains(연구분야) + dd").text().trim(); | ||
String lab = detalTagElement.select("dt:contains(연구실) + dd").text().trim(); | ||
String extensionNumber = detalTagElement.select("dt:contains(연락처) + dd").text().trim(); | ||
String email = detalTagElement.select("dt:contains(이메일) + dd").text().trim(); | ||
return new String[]{name, jobPosition, major, lab, extensionNumber, email}; | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,20 +4,13 @@ | |
import com.kustacks.kuring.common.exception.code.ErrorCode; | ||
import com.kustacks.kuring.worker.scrap.client.NormalJsoupClient; | ||
import com.kustacks.kuring.worker.scrap.deptinfo.DeptInfo; | ||
import com.kustacks.kuring.worker.scrap.deptinfo.art_design.CommunicationDesignDept; | ||
import com.kustacks.kuring.worker.scrap.deptinfo.art_design.LivingDesignDept; | ||
import com.kustacks.kuring.worker.scrap.deptinfo.real_estate.RealEstateDept; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.springframework.beans.factory.annotation.Value; | ||
import org.springframework.stereotype.Component; | ||
import org.springframework.web.util.UriComponentsBuilder; | ||
|
||
import java.io.IOException; | ||
import java.util.HashMap; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
@Component | ||
public class EachDeptStaffApiClient implements StaffApiClient { | ||
|
@@ -32,49 +25,30 @@ public EachDeptStaffApiClient(NormalJsoupClient normalJsoupClient) { | |
this.jsoupClient = normalJsoupClient; | ||
} | ||
|
||
/* | ||
TODO: 만약, 학과별로 다른 API Client를 구성해야 한다면 support 구현 필요.(현재는 교직원 스크랩을 위한 모든 API 클래이언트 스펙 동일, 파싱에서 분리) [2024.11.28 김한주] | ||
*/ | ||
@Override | ||
public boolean support(DeptInfo deptInfo) { | ||
return !(deptInfo instanceof RealEstateDept) && | ||
!(deptInfo instanceof LivingDesignDept) && | ||
!(deptInfo instanceof CommunicationDesignDept); | ||
return true; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 부동산 학과랑 리빙디자인 학과에서도 잘 동작함을 확인잘 해주셨쥬? |
||
} | ||
|
||
@Override | ||
public List<Document> getHTML(DeptInfo deptInfo) throws InternalLogicException { | ||
return deptInfo.getProfessorForumIds().stream() | ||
.flatMap(professorForumId -> getProfessorHtmlById(professorForumId).stream()) | ||
return deptInfo.getStaffSiteIds().stream() | ||
.flatMap(siteId -> getProfessorHtmlByDeptAndSiteId(deptInfo.getStaffSiteName(), siteId).stream()) | ||
.toList(); | ||
} | ||
|
||
private List<Document> getProfessorHtmlById(String professorForumId) { | ||
private List<Document> getProfessorHtmlByDeptAndSiteId(String siteName, int siteId) { | ||
LinkedList<Document> documents = new LinkedList<>(); | ||
|
||
String url = buildProfessorInfoUrl(professorForumId); | ||
String url = buildDeptStaffPageUrl(siteName, siteId); | ||
Document document = getDocument(url); | ||
documents.add(document); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 원레 여기있던 반복문이 그 각 페이지별로 한번씩 요청했던걸로 기억해요! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 넵 제가 확인해본 바로는 한 페이지 안에 모든 교수님 정보가 포함되어 있습니다! 대신에 siteId에 따라서 전임교수, 명예교수, 겸임교수, 강사 등이 퍼져있는 경우도 있고, 한 페이지에 합쳐져 있는 경우도 있고, 수의학과/수의예과 같은 케이스도 있습니당 혹쉬 이것들도 siteId를 추가할까욥? |
||
|
||
int totalPageNum = getTotalPageNumber(document); | ||
for (int pageNumber = 2; pageNumber <= totalPageNum; pageNumber++) { | ||
documents.add(parseDocumentByPageNumber(url, pageNumber)); | ||
} | ||
|
||
return documents; | ||
} | ||
|
||
private Document parseDocumentByPageNumber(String url, int pageNumber) { | ||
try { | ||
Map<String, String> requestBody = new HashMap<>(); | ||
requestBody.put("pageNum", String.valueOf(pageNumber)); | ||
return jsoupClient.post(url, STAFF_SCRAP_TIMEOUT, requestBody); | ||
} catch (IOException e) { | ||
throw new InternalLogicException(ErrorCode.STAFF_SCRAPER_CANNOT_SCRAP, e); | ||
} | ||
} | ||
|
||
private static int getTotalPageNumber(Document document) { | ||
Element pageNumHiddenInput = document.getElementById("totalPageCount"); | ||
return Integer.parseInt(pageNumHiddenInput.val()); | ||
} | ||
|
||
private Document getDocument(String url) { | ||
try { | ||
|
@@ -84,7 +58,8 @@ private Document getDocument(String url) { | |
} | ||
} | ||
|
||
private String buildProfessorInfoUrl(String pfForumId) { | ||
return UriComponentsBuilder.fromUriString(baseUrl).queryParam("pfForumId", pfForumId).toUriString(); | ||
private String buildDeptStaffPageUrl(String department, int siteId) { | ||
return baseUrl.replaceAll("\\{department\\}", department) | ||
.replace("{siteId}", String.valueOf(siteId)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
함수 이름이 convertValidEmail 인 상황인데 "" 이 나오면 이건 ValidEmail로 간주하는 것 일까요?
방어 로직이 있는 것 자체는 좋아요! 다만 호출하는 쪽에서 null과 blank가 아닌 경우에만 converting을 시도하는 것도 좋을 수도 있어요.
의견입니다!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
동의합니다~! 조금 확장해서 EmailSupporter로 만드는게 더 좋을 수 있겠다는 생각이 드네용!
외부에서 null과 blank를 체크하는 메서드를 호출(isNullOrBlank)한 후 결과에 따라 convertEmail 메서드를 통해 변환하도록 구현하는게 조금 더 명확할거 같다고 생각합니다!
이런 방향성이면 PhoneNumberConverter도 수정이 필요할거 같네욤